Master branch updates from beta September 2023 #337
Binary file not shown.
|
@ -21,7 +21,7 @@ public class ClusteringCombiner {
|
|||
for (final ClusteringDef cd : defs) {
|
||||
for (final String fieldName : cd.getFields()) {
|
||||
final Field values = a.values(fieldName);
|
||||
res.addAll(cd.getClusteringFunction().apply((List<Field>) values));
|
||||
res.addAll(cd.clusteringFunction().apply((List<Field>) values));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
|
|
|
@ -13,6 +13,7 @@ import eu.dnetlib.pace.model.CondDef;
|
|||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
|
||||
public class PaceConfig implements Serializable {
|
||||
|
||||
|
@ -57,10 +58,12 @@ public class PaceConfig implements Serializable {
|
|||
return conditions;
|
||||
}
|
||||
|
||||
/**
 * Returns the list of {@code ConditionAlgo} produced by handing the result of
 * {@code getConditions()} to {@code asConditionAlgos}.
 *
 * NOTE(review): annotated with {@code @JsonIgnore}; presumably this keeps the
 * derived getter out of the JSON form of the configuration. Confirm against
 * the serializer actually used for this class.
 *
 * @return the value returned by {@code asConditionAlgos(getConditions())}
 */
@JsonIgnore
|
||||
public List<ConditionAlgo> getConditionAlgos() {
|
||||
return asConditionAlgos(getConditions());
|
||||
}
|
||||
|
||||
/**
 * Returns the list of {@code ConditionAlgo} produced by handing the result of
 * {@code getStrictConditions()} to {@code asConditionAlgos}.
 *
 * NOTE(review): annotated with {@code @JsonIgnore}; presumably this keeps the
 * derived getter out of the JSON form of the configuration. Confirm against
 * the serializer actually used for this class.
 *
 * @return the value returned by {@code asConditionAlgos(getStrictConditions())}
 */
@JsonIgnore
|
||||
public List<ConditionAlgo> getStrictConditionAlgos() {
|
||||
return asConditionAlgos(getStrictConditions());
|
||||
}
|
||||
|
@ -102,7 +105,7 @@ public class PaceConfig implements Serializable {
|
|||
final List<FieldDef> fields = getModel().stream()
|
||||
.filter(fd -> cd.getFields().contains(fd.getName()))
|
||||
.collect(Collectors.toList());
|
||||
algos.add(cd.getConditionAlgo(fields));
|
||||
algos.add(cd.conditionAlgo(fields));
|
||||
}
|
||||
return algos;
|
||||
}
|
||||
|
|
|
@ -77,7 +77,7 @@ public class DistanceScorer {
|
|||
}
|
||||
} else {
|
||||
if (va.getType().equals(vb.getType())) {
|
||||
de.setDistance(w * fd.getDistanceAlgo().distance(va, vb));
|
||||
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
|
||||
} else {
|
||||
throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ public class ClusteringDef implements Serializable {
|
|||
this.name = name;
|
||||
}
|
||||
|
||||
public ClusteringFunction getClusteringFunction() {
|
||||
public ClusteringFunction clusteringFunction() {
|
||||
try {
|
||||
return PaceConfig.paceResolver.getClusteringFunction(getName(), params);
|
||||
} catch (PaceException e) {
|
||||
|
|
|
@ -16,7 +16,7 @@ public class CondDef implements Serializable {
|
|||
|
||||
/** No-argument constructor with an empty body. NOTE(review): presumably kept for framework instantiation; confirm. */
public CondDef() {}
|
||||
|
||||
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields){
|
||||
public ConditionAlgo conditionAlgo(final List<FieldDef> fields){
|
||||
return PaceConfig.paceResolver.getConditionAlgo(getName(), fields);
|
||||
}
|
||||
|
||||
|
|
|
@ -68,7 +68,7 @@ public class FieldDef implements Serializable {
|
|||
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
|
||||
}
|
||||
|
||||
public DistanceAlgo getDistanceAlgo() {
|
||||
public DistanceAlgo distanceAlgo() {
|
||||
|
||||
if (params == null) {
|
||||
params = new HashMap<>();
|
||||
|
|
|
@ -14,69 +14,23 @@ import static org.junit.Assert.assertNotNull;
|
|||
public class ConfigTest extends AbstractPaceTest {
|
||||
|
||||
@Test
|
||||
public void dedupConfigSerializationTest() throws IOException {
|
||||
public void dedupConfigSerializationTest() {
|
||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
|
||||
|
||||
final String conf = cfgFromClasspath.toString();
|
||||
|
||||
// System.out.println("*****SERIALIZED*****");
|
||||
// System.out.println(conf);
|
||||
// System.out.println("*****FROM CLASSPATH*****");
|
||||
// System.out.println(readFromClasspath("result.pace.conf.json"));
|
||||
|
||||
final DedupConfig cfgFromSerialization = DedupConfig.load(conf);
|
||||
|
||||
assertEquals(cfgFromClasspath.toString(), cfgFromSerialization.toString());
|
||||
|
||||
assertNotNull(cfgFromClasspath);
|
||||
assertNotNull(cfgFromSerialization);
|
||||
|
||||
String conf = "{ \n" +
|
||||
"wf\" : { " +
|
||||
" \"threshold\" : \"0.99\", " +
|
||||
" \"run\" : \"001\", " +
|
||||
" \"entityType\" : \"result\", " +
|
||||
" \"orderField\" : \"title\", " +
|
||||
" \"queueMaxSize\" : \"2000\"," +
|
||||
" \"groupMaxSize\" : \"10\"," +
|
||||
" \"slidingWindowSize\" : \"200\"," +
|
||||
" \"rootBuilder\" : [ \"result\" ]," +
|
||||
" \"includeChildren\" : \"true\" " +
|
||||
" }," +
|
||||
"\t\"pace\" : {\t\t\n" +
|
||||
"\t\t\"clustering\" : [\n" +
|
||||
"\t\t\t{ \"name\" : \"acronyms\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"minLen\" : \"2\", \"maxLen\" : \"4\"} },\n" +
|
||||
"\t\t\t{ \"name\" : \"ngrampairs\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"ngramLen\" : \"3\"} },\n" +
|
||||
"\t\t\t{ \"name\" : \"suffixprefix\", \"fields\" : [ \"title\" ], \"params\" : { \"max\" : \"1\", \"len\" : \"3\" } } \n" +
|
||||
"\t\t],\t\t\n" +
|
||||
"\t\t\"strictConditions\" : [\n" +
|
||||
" \t\t\t{ \"name\" : \"exactMatch\", \"fields\" : [ \"pid\" ] }\n" +
|
||||
" \t\t], \n" +
|
||||
" \t\t\"conditions\" : [ \n" +
|
||||
" \t\t\t{ \"name\" : \"yearMatch\", \"fields\" : [ \"dateofacceptance\" ] },\n" +
|
||||
" \t\t\t{ \"name\" : \"titleVersionMatch\", \"fields\" : [ \"title\" ] },\n" +
|
||||
" \t\t\t{ \"name\" : \"sizeMatch\", \"fields\" : [ \"authors\" ] } \n" +
|
||||
" \t\t],\t\t\n" +
|
||||
"\t\t\"model\" : [\n" +
|
||||
"\t\t\t{ \"name\" : \"pid\", \"algo\" : \"Null\", \"type\" : \"String\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"pid[qualifier#classid = {doi}]/value\", \"overrideMatch\" : \"true\" }, \t\n" +
|
||||
"\t\t\t{ \"name\" : \"title\", \"algo\" : \"JaroWinkler\", \"type\" : \"String\", \"weight\" : \"1.0\", \"ignoreMissing\" : \"false\", \"path\" : \"result/metadata/title[qualifier#classid = {main title}]/value\" },\n" +
|
||||
"\t\t\t{ \"name\" : \"dateofacceptance\", \"algo\" : \"Null\", \"type\" : \"String\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"result/metadata/dateofacceptance/value\" } ,\n" +
|
||||
"\t\t\t{ \"name\" : \"authors\", \"algo\" : \"Null\", \"type\" : \"List\", \"weight\" : \"0.0\", \"ignoreMissing\" : \"true\", \"path\" : \"result/author/metadata/fullname/value\" }\n" +
|
||||
"\t\t],\n" +
|
||||
"\t\t\"blacklists\" : {\n" +
|
||||
"\t\t\t\"title\" : [\n" +
|
||||
"\t\t\t\t\"^(Corpus Oral Dialectal \\\\(COD\\\\)\\\\.).*$\",\n" +
|
||||
"\t\t\t\t\"^(Kiri Karl Morgensternile).*$\",\n" +
|
||||
"\t\t\t\t\"^(\\\\[Eksliibris Aleksandr).*\\\\]$\",\n" +
|
||||
"\t\t\t\t\"^(\\\\[Eksliibris Aleksandr).*$\",\n" +
|
||||
"\t\t\t\t\"^(Eksliibris Aleksandr).*$\",\n" +
|
||||
"\t\t\t\t\"^(Kiri A\\\\. de Vignolles).*$\",\n" +
|
||||
"\t\t\t\t\"^(2 kirja Karl Morgensternile).*$\",\n" +
|
||||
"\t\t\t\t\"^(Pirita kloostri idaosa arheoloogilised).*$\",\n" +
|
||||
"\t\t\t\t\"^(Kiri tundmatule).*$\",\n" +
|
||||
"\t\t\t\t\"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$\",\n" +
|
||||
"\t\t\t\t\"^(Eksliibris Nikolai Birukovile).*$\",\n" +
|
||||
"\t\t\t\t\"^(Eksliibris Nikolai Issakovile).*$\",\n" +
|
||||
"\t\t\t\t\"^(WHP Cruise Summary Information of section).*$\",\n" +
|
||||
"\t\t\t\t\"^(Measurement of the top quark\\\\-pair production cross section with ATLAS in pp collisions at).*$\",\n" +
|
||||
"\t\t\t\t\"^(Measurement of the spin\\\\-dependent structure function).*\"\n" +
|
||||
"\t\t\t] } \t\t\n" +
|
||||
"\t}\n" +
|
||||
"\n" +
|
||||
"}";
|
||||
|
||||
final DedupConfig cfgFromSerialization = DedupConfig.load(cfgFromClasspath.toString());
|
||||
String params = "\"params\":{\"limit\":-1,\"weight\":0.0}";
|
||||
//verify if the serialization produces the same result of the input json
|
||||
assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", ""));
|
||||
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue