diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java index 52859b4b8..77a6aa137 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java @@ -1,7 +1,9 @@ package eu.dnetlib.pace.clustering; +import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.stream.Collectors; import com.google.common.collect.Sets; @@ -9,9 +11,14 @@ import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldValueImpl; +import org.apache.commons.lang3.StringUtils; public class ClusteringCombiner { + private static String SEPARATOR = ":"; + private static String COLLAPSE_ON= "collapseOn"; + public static Collection combine(final Document a, final Config conf) { return new ClusteringCombiner().doCombine(a, conf); } @@ -20,10 +27,38 @@ public class ClusteringCombiner { final Collection res = Sets.newLinkedHashSet(); for (final ClusteringDef cd : conf.clusterings()) { for (final String fieldName : cd.getFields()) { - final Field values = a.values(fieldName); - res.addAll(cd.clusteringFunction().apply(conf, (List) values)); + String prefix = getPrefix(cd, fieldName); + + Field values = a.values(fieldName); + List fields = new ArrayList<>(); + + if (values instanceof FieldValueImpl) { + fields.add(values); + } + else { + fields.addAll((List) values); + } + + res.addAll( + cd.clusteringFunction() + .apply(conf, fields) + .stream() + .map(k -> prefix + SEPARATOR +k) + .collect(Collectors.toList()) + ); } } return res; } + + private String getPrefix(ClusteringDef cd, String fieldName) { + return cd.getName()+ SEPARATOR + + cd.getParams().keySet() + .stream() + .filter(k -> k.contains(COLLAPSE_ON)) + .findFirst() + .map(k -> StringUtils.substringAfter(k, SEPARATOR)) + .orElse(fieldName); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index b6e27fa21..43b3a9276 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -5,6 +5,7 @@ import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.lang3.StringUtils; import java.io.IOException; import java.io.Serializable; @@ -12,6 +13,8 @@ import java.util.List; public class TreeNodeDef implements Serializable { + final static String CROSS_COMPARE = "crossCompare"; + private List fields; private AggType aggregation; @@ -45,7 +48,17 @@ public class TreeNodeDef implements Serializable { double weight = fieldConf.getWeight(); - double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); + double result; + + //if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum + if(fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) { + String crossField = fieldConf.getParams().get(CROSS_COMPARE); + double result1 = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf); + double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf); + result = Math.max(result1,result2); + } + else + result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); stats.addFieldStats( fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 1b4f6f0eb..91a327474 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -175,5 +175,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println(cf.apply(conf, Lists.newArrayList(title(s5)))); } - } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 05a8636ac..dc7b11a8a 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -2,16 +2,24 @@ package eu.dnetlib.pace.config; import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.clustering.ClusteringClass; +import eu.dnetlib.pace.clustering.ClusteringCombiner; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.tree.JsonListMatch; +import eu.dnetlib.pace.tree.support.AggType; +import eu.dnetlib.pace.tree.support.FieldConf; +import eu.dnetlib.pace.tree.support.TreeNodeDef; +import eu.dnetlib.pace.tree.support.TreeNodeStats; import eu.dnetlib.pace.util.MapDocumentUtil; import org.junit.jupiter.api.*; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -20,13 +28,14 @@ import java.util.stream.Collectors; public class ConfigTest extends AbstractPaceTest { - private Map params; + private static Map params; @BeforeAll - public void setup() { + public static void setup() { params = new HashMap<>(); params.put("jpath_value", "$.value"); params.put("jpath_classid", "$.qualifier.classid"); + } @Test @@ -102,4 +111,37 @@ public class ConfigTest extends AbstractPaceTest { System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json)); } + + @Test + public void clusteringCombinerTest() { + + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); + + final String json = readFromClasspath("publication.json"); + + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); + + String[] combine = ClusteringCombiner.combine(mapDocument, dedupConf).toArray(new String[3]); + + assertEquals("test", combine[0].split(":")[1]); + assertEquals("title", combine[1].split(":")[1]); + assertEquals("doi", combine[2].split(":")[1]); + } + + @Test + public void crossCompareTest() { + + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.cross.compare.conf.json")); + + TreeNodeDef treeNode = dedupConf.decisionTree().get("start"); + + final String json = readFromClasspath("organization.json"); + + final MapDocument doc = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); + + TreeNodeStats nodeStats = treeNode.evaluate(doc, doc, dedupConf); + + assertTrue(nodeStats.getFinalScore(AggType.MAX)>0.7); + + } } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.cross.compare.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.cross.compare.conf.json new file mode 100644 index 000000000..e6845aeca --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.cross.compare.conf.json @@ -0,0 +1,52 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "50", + "slidingWindowSize" : "200", + "idPath":"$.id", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true", + "maxIterations": "20" + }, + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } + ], + "decisionTree" : { + "start": { + "fields": [ + { + "field": "legalname", + "comparator": "levenstein", + "weight": 1, + "countIfUndefined": "false", + "params": { + "crossCompare" : "legalshortname" + } + } + ], + "threshold": 1, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "false" + } + }, + "model" : [ + { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" } + ], + "blacklists" : { + "legalname" : [] + }, + "synonyms": {} + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json index 78a3b4e44..267ed458a 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json @@ -29,7 +29,7 @@ }, "pace": { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3", "collapseOn:test": ""} }, { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ],