Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
8 changed files with 22 additions and 65 deletions
Showing only changes of commit dc41b76643 - Show all commits

Binary file not shown.

View File

@ -21,7 +21,7 @@ public class ClusteringCombiner {
for (final ClusteringDef cd : defs) { for (final ClusteringDef cd : defs) {
for (final String fieldName : cd.getFields()) { for (final String fieldName : cd.getFields()) {
final Field values = a.values(fieldName); final Field values = a.values(fieldName);
res.addAll(cd.getClusteringFunction().apply((List<Field>) values)); res.addAll(cd.clusteringFunction().apply((List<Field>) values));
} }
} }
return res; return res;

View File

@ -13,6 +13,7 @@ import eu.dnetlib.pace.model.CondDef;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceResolver; import eu.dnetlib.pace.util.PaceResolver;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
public class PaceConfig implements Serializable { public class PaceConfig implements Serializable {
@ -57,10 +58,12 @@ public class PaceConfig implements Serializable {
return conditions; return conditions;
} }
@JsonIgnore
public List<ConditionAlgo> getConditionAlgos() { public List<ConditionAlgo> getConditionAlgos() {
return asConditionAlgos(getConditions()); return asConditionAlgos(getConditions());
} }
@JsonIgnore
public List<ConditionAlgo> getStrictConditionAlgos() { public List<ConditionAlgo> getStrictConditionAlgos() {
return asConditionAlgos(getStrictConditions()); return asConditionAlgos(getStrictConditions());
} }
@ -102,7 +105,7 @@ public class PaceConfig implements Serializable {
final List<FieldDef> fields = getModel().stream() final List<FieldDef> fields = getModel().stream()
.filter(fd -> cd.getFields().contains(fd.getName())) .filter(fd -> cd.getFields().contains(fd.getName()))
.collect(Collectors.toList()); .collect(Collectors.toList());
algos.add(cd.getConditionAlgo(fields)); algos.add(cd.conditionAlgo(fields));
} }
return algos; return algos;
} }

View File

@ -77,7 +77,7 @@ public class DistanceScorer {
} }
} else { } else {
if (va.getType().equals(vb.getType())) { if (va.getType().equals(vb.getType())) {
de.setDistance(w * fd.getDistanceAlgo().distance(va, vb)); de.setDistance(w * fd.distanceAlgo().distance(va, vb));
} else { } else {
throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType())); throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
} }

View File

@ -28,7 +28,7 @@ public class ClusteringDef implements Serializable {
this.name = name; this.name = name;
} }
public ClusteringFunction getClusteringFunction() { public ClusteringFunction clusteringFunction() {
try { try {
return PaceConfig.paceResolver.getClusteringFunction(getName(), params); return PaceConfig.paceResolver.getClusteringFunction(getName(), params);
} catch (PaceException e) { } catch (PaceException e) {

View File

@ -16,7 +16,7 @@ public class CondDef implements Serializable {
public CondDef() {} public CondDef() {}
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields){ public ConditionAlgo conditionAlgo(final List<FieldDef> fields){
return PaceConfig.paceResolver.getConditionAlgo(getName(), fields); return PaceConfig.paceResolver.getConditionAlgo(getName(), fields);
} }

View File

@ -68,7 +68,7 @@ public class FieldDef implements Serializable {
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath())); return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
} }
public DistanceAlgo getDistanceAlgo() { public DistanceAlgo distanceAlgo() {
if (params == null) { if (params == null) {
params = new HashMap<>(); params = new HashMap<>();

View File

@ -14,69 +14,23 @@ import static org.junit.Assert.assertNotNull;
public class ConfigTest extends AbstractPaceTest { public class ConfigTest extends AbstractPaceTest {
@Test @Test
public void dedupConfigSerializationTest() throws IOException { public void dedupConfigSerializationTest() {
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json")); final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
final String conf = cfgFromClasspath.toString();
// System.out.println("*****SERIALIZED*****");
// System.out.println(conf);
// System.out.println("*****FROM CLASSPATH*****");
// System.out.println(readFromClasspath("result.pace.conf.json"));
final DedupConfig cfgFromSerialization = DedupConfig.load(conf);
assertEquals(cfgFromClasspath.toString(), cfgFromSerialization.toString());
assertNotNull(cfgFromClasspath); assertNotNull(cfgFromClasspath);
assertNotNull(cfgFromSerialization);
String conf = "{ \n" +
"wf\" : { " +
" \"threshold\" : \"0.99\", " +
" \"run\" : \"001\", " +
" \"entityType\" : \"result\", " +
" \"orderField\" : \"title\", " +
" \"queueMaxSize\" : \"2000\"," +
" \"groupMaxSize\" : \"10\"," +
" \"slidingWindowSize\" : \"200\"," +
" \"rootBuilder\" : [ \"result\" ]," +
" \"includeChildren\" : \"true\" " +
" }," +
"\t\"pace\" : {\t\t\n" +
"\t\t\"clustering\" : [\n" +
"\t\t\t{ \"name\" : \"acronyms\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"minLen\" : \"2\", \"maxLen\" : \"4\"} },\n" +
"\t\t\t{ \"name\" : \"ngrampairs\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"ngramLen\" : \"3\"} },\n" +
"\t\t\t{ \"name\" : \"suffixprefix\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"len\" : \"3\" } } \n" +
"\t\t],\t\t\n" +
"\t\t\"strictConditions\" : [\n" +
" \t\t\t{ \"name\" : \"exactMatch\", \"fields\" : [ \"pid\" ] }\n" +
" \t\t], \n" +
" \t\t\"conditions\" : [ \n" +
" \t\t\t{ \"name\" : \"yearMatch\", \"fields\" : [ \"dateofacceptance\" ] },\n" +
" \t\t\t{ \"name\" : \"titleVersionMatch\", \"fields\" : [ \"title\" ] },\n" +
" \t\t\t{ \"name\" : \"sizeMatch\", \"fields\" : [ \"authors\" ] } \n" +
" \t\t],\t\t\n" +
"\t\t\"model\" : [\n" +
"\t\t\t{ \"name\" : \"pid\", \"algo\" : \"Null\", \"type\" : \"String\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"pid[qualifier#classid = {doi}]/value\", \"overrideMatch\" : \"true\" }, \t\n" +
"\t\t\t{ \"name\" : \"title\", \"algo\" : \"JaroWinkler\", \"type\" : \"String\", \"weight\" : \"1.0\", \"ignoreMissing\" : \"false\", \"path\" : \"result/metadata/title[qualifier#classid = {main title}]/value\" },\n" +
"\t\t\t{ \"name\" : \"dateofacceptance\", \"algo\" : \"Null\", \"type\" : \"String\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"result/metadata/dateofacceptance/value\" } ,\n" +
"\t\t\t{ \"name\" : \"authors\", \"algo\" : \"Null\", \"type\" : \"List\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"result/author/metadata/fullname/value\" }\n" +
"\t\t],\n" +
"\t\t\"blacklists\" : {\n" +
"\t\t\t\"title\" : [\n" +
"\t\t\t\t\"^(Corpus Oral Dialectal \\\\(COD\\\\)\\\\.).*$\",\n" +
"\t\t\t\t\"^(Kiri Karl Morgensternile).*$\",\n" +
"\t\t\t\t\"^(\\\\[Eksliibris Aleksandr).*\\\\]$\",\n" +
"\t\t\t\t\"^(\\\\[Eksliibris Aleksandr).*$\",\n" +
"\t\t\t\t\"^(Eksliibris Aleksandr).*$\",\n" +
"\t\t\t\t\"^(Kiri A\\\\. de Vignolles).*$\",\n" +
"\t\t\t\t\"^(2 kirja Karl Morgensternile).*$\",\n" +
"\t\t\t\t\"^(Pirita kloostri idaosa arheoloogilised).*$\",\n" +
"\t\t\t\t\"^(Kiri tundmatule).*$\",\n" +
"\t\t\t\t\"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$\",\n" +
"\t\t\t\t\"^(Eksliibris Nikolai Birukovile).*$\",\n" +
"\t\t\t\t\"^(Eksliibris Nikolai Issakovile).*$\",\n" +
"\t\t\t\t\"^(WHP Cruise Summary Information of section).*$\",\n" +
"\t\t\t\t\"^(Measurement of the top quark\\\\-pair production cross section with ATLAS in pp collisions at).*$\",\n" +
"\t\t\t\t\"^(Measurement of the spin\\\\-dependent structure function).*\"\n" +
"\t\t\t] } \t\t\n" +
"\t}\n" +
"\n" +
"}";
final DedupConfig cfgFromSerialization = DedupConfig.load(cfgFromClasspath.toString());
String params = "\"params\":{\"limit\":-1,\"weight\":0.0}";
//verify if the serialization produces the same result of the input json
assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", ""));
} }