Master branch updates from beta September 2023 #337
Binary file not shown.
|
@ -21,7 +21,7 @@ public class ClusteringCombiner {
|
||||||
for (final ClusteringDef cd : defs) {
|
for (final ClusteringDef cd : defs) {
|
||||||
for (final String fieldName : cd.getFields()) {
|
for (final String fieldName : cd.getFields()) {
|
||||||
final Field values = a.values(fieldName);
|
final Field values = a.values(fieldName);
|
||||||
res.addAll(cd.getClusteringFunction().apply((List<Field>) values));
|
res.addAll(cd.clusteringFunction().apply((List<Field>) values));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
|
|
@ -13,6 +13,7 @@ import eu.dnetlib.pace.model.CondDef;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
import eu.dnetlib.pace.util.PaceResolver;
|
import eu.dnetlib.pace.util.PaceResolver;
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
|
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||||
|
|
||||||
public class PaceConfig implements Serializable {
|
public class PaceConfig implements Serializable {
|
||||||
|
|
||||||
|
@ -57,10 +58,12 @@ public class PaceConfig implements Serializable {
|
||||||
return conditions;
|
return conditions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@JsonIgnore
|
||||||
public List<ConditionAlgo> getConditionAlgos() {
|
public List<ConditionAlgo> getConditionAlgos() {
|
||||||
return asConditionAlgos(getConditions());
|
return asConditionAlgos(getConditions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@JsonIgnore
|
||||||
public List<ConditionAlgo> getStrictConditionAlgos() {
|
public List<ConditionAlgo> getStrictConditionAlgos() {
|
||||||
return asConditionAlgos(getStrictConditions());
|
return asConditionAlgos(getStrictConditions());
|
||||||
}
|
}
|
||||||
|
@ -102,7 +105,7 @@ public class PaceConfig implements Serializable {
|
||||||
final List<FieldDef> fields = getModel().stream()
|
final List<FieldDef> fields = getModel().stream()
|
||||||
.filter(fd -> cd.getFields().contains(fd.getName()))
|
.filter(fd -> cd.getFields().contains(fd.getName()))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
algos.add(cd.getConditionAlgo(fields));
|
algos.add(cd.conditionAlgo(fields));
|
||||||
}
|
}
|
||||||
return algos;
|
return algos;
|
||||||
}
|
}
|
||||||
|
|
|
@ -77,7 +77,7 @@ public class DistanceScorer {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (va.getType().equals(vb.getType())) {
|
if (va.getType().equals(vb.getType())) {
|
||||||
de.setDistance(w * fd.getDistanceAlgo().distance(va, vb));
|
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
|
||||||
} else {
|
} else {
|
||||||
throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,7 +28,7 @@ public class ClusteringDef implements Serializable {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ClusteringFunction getClusteringFunction() {
|
public ClusteringFunction clusteringFunction() {
|
||||||
try {
|
try {
|
||||||
return PaceConfig.paceResolver.getClusteringFunction(getName(), params);
|
return PaceConfig.paceResolver.getClusteringFunction(getName(), params);
|
||||||
} catch (PaceException e) {
|
} catch (PaceException e) {
|
||||||
|
|
|
@ -16,7 +16,7 @@ public class CondDef implements Serializable {
|
||||||
|
|
||||||
public CondDef() {}
|
public CondDef() {}
|
||||||
|
|
||||||
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields){
|
public ConditionAlgo conditionAlgo(final List<FieldDef> fields){
|
||||||
return PaceConfig.paceResolver.getConditionAlgo(getName(), fields);
|
return PaceConfig.paceResolver.getConditionAlgo(getName(), fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -68,7 +68,7 @@ public class FieldDef implements Serializable {
|
||||||
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
|
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
|
||||||
}
|
}
|
||||||
|
|
||||||
public DistanceAlgo getDistanceAlgo() {
|
public DistanceAlgo distanceAlgo() {
|
||||||
|
|
||||||
if (params == null) {
|
if (params == null) {
|
||||||
params = new HashMap<>();
|
params = new HashMap<>();
|
||||||
|
|
|
@ -14,69 +14,23 @@ import static org.junit.Assert.assertNotNull;
|
||||||
public class ConfigTest extends AbstractPaceTest {
|
public class ConfigTest extends AbstractPaceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void dedupConfigSerializationTest() throws IOException {
|
public void dedupConfigSerializationTest() {
|
||||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
|
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
|
||||||
|
|
||||||
|
final String conf = cfgFromClasspath.toString();
|
||||||
|
|
||||||
|
// System.out.println("*****SERIALIZED*****");
|
||||||
|
// System.out.println(conf);
|
||||||
|
// System.out.println("*****FROM CLASSPATH*****");
|
||||||
|
// System.out.println(readFromClasspath("result.pace.conf.json"));
|
||||||
|
|
||||||
|
final DedupConfig cfgFromSerialization = DedupConfig.load(conf);
|
||||||
|
|
||||||
|
assertEquals(cfgFromClasspath.toString(), cfgFromSerialization.toString());
|
||||||
|
|
||||||
assertNotNull(cfgFromClasspath);
|
assertNotNull(cfgFromClasspath);
|
||||||
|
assertNotNull(cfgFromSerialization);
|
||||||
|
|
||||||
String conf = "{ \n" +
|
|
||||||
"wf\" : { " +
|
|
||||||
" \"threshold\" : \"0.99\", " +
|
|
||||||
" \"run\" : \"001\", " +
|
|
||||||
" \"entityType\" : \"result\", " +
|
|
||||||
" \"orderField\" : \"title\", " +
|
|
||||||
" \"queueMaxSize\" : \"2000\"," +
|
|
||||||
" \"groupMaxSize\" : \"10\"," +
|
|
||||||
" \"slidingWindowSize\" : \"200\"," +
|
|
||||||
" \"rootBuilder\" : [ \"result\" ]," +
|
|
||||||
" \"includeChildren\" : \"true\" " +
|
|
||||||
" }," +
|
|
||||||
"\t\"pace\" : {\t\t\n" +
|
|
||||||
"\t\t\"clustering\" : [\n" +
|
|
||||||
"\t\t\t{ \"name\" : \"acronyms\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"minLen\" : \"2\", \"maxLen\" : \"4\"} },\n" +
|
|
||||||
"\t\t\t{ \"name\" : \"ngrampairs\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"ngramLen\" : \"3\"} },\n" +
|
|
||||||
"\t\t\t{ \"name\" : \"suffixprefix\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"len\" : \"3\" } } \n" +
|
|
||||||
"\t\t],\t\t\n" +
|
|
||||||
"\t\t\"strictConditions\" : [\n" +
|
|
||||||
" \t\t\t{ \"name\" : \"exactMatch\", \"fields\" : [ \"pid\" ] }\n" +
|
|
||||||
" \t\t], \n" +
|
|
||||||
" \t\t\"conditions\" : [ \n" +
|
|
||||||
" \t\t\t{ \"name\" : \"yearMatch\", \"fields\" : [ \"dateofacceptance\" ] },\n" +
|
|
||||||
" \t\t\t{ \"name\" : \"titleVersionMatch\", \"fields\" : [ \"title\" ] },\n" +
|
|
||||||
" \t\t\t{ \"name\" : \"sizeMatch\", \"fields\" : [ \"authors\" ] } \n" +
|
|
||||||
" \t\t],\t\t\n" +
|
|
||||||
"\t\t\"model\" : [\n" +
|
|
||||||
"\t\t\t{ \"name\" : \"pid\", \"algo\" : \"Null\", \"type\" : \"String\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"pid[qualifier#classid = {doi}]/value\", \"overrideMatch\" : \"true\" }, \t\n" +
|
|
||||||
"\t\t\t{ \"name\" : \"title\", \"algo\" : \"JaroWinkler\", \"type\" : \"String\", \"weight\" : \"1.0\", \"ignoreMissing\" : \"false\", \"path\" : \"result/metadata/title[qualifier#classid = {main title}]/value\" },\n" +
|
|
||||||
"\t\t\t{ \"name\" : \"dateofacceptance\", \"algo\" : \"Null\", \"type\" : \"String\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"result/metadata/dateofacceptance/value\" } ,\n" +
|
|
||||||
"\t\t\t{ \"name\" : \"authors\", \"algo\" : \"Null\", \"type\" : \"List\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"result/author/metadata/fullname/value\" }\n" +
|
|
||||||
"\t\t],\n" +
|
|
||||||
"\t\t\"blacklists\" : {\n" +
|
|
||||||
"\t\t\t\"title\" : [\n" +
|
|
||||||
"\t\t\t\t\"^(Corpus Oral Dialectal \\\\(COD\\\\)\\\\.).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Kiri Karl Morgensternile).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(\\\\[Eksliibris Aleksandr).*\\\\]$\",\n" +
|
|
||||||
"\t\t\t\t\"^(\\\\[Eksliibris Aleksandr).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Eksliibris Aleksandr).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Kiri A\\\\. de Vignolles).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(2 kirja Karl Morgensternile).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Pirita kloostri idaosa arheoloogilised).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Kiri tundmatule).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Eksliibris Nikolai Birukovile).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Eksliibris Nikolai Issakovile).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(WHP Cruise Summary Information of section).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Measurement of the top quark\\\\-pair production cross section with ATLAS in pp collisions at).*$\",\n" +
|
|
||||||
"\t\t\t\t\"^(Measurement of the spin\\\\-dependent structure function).*\"\n" +
|
|
||||||
"\t\t\t] } \t\t\n" +
|
|
||||||
"\t}\n" +
|
|
||||||
"\n" +
|
|
||||||
"}";
|
|
||||||
|
|
||||||
final DedupConfig cfgFromSerialization = DedupConfig.load(cfgFromClasspath.toString());
|
|
||||||
String params = "\"params\":{\"limit\":-1,\"weight\":0.0}";
|
|
||||||
//verify if the serialization produces the same result of the input json
|
|
||||||
assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", ""));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue