Implementation of cross comparison between different fields; addition of a clustering mechanism to collapse keys from different clustering functions into the same cluster

This commit is contained in:
miconis 2021-05-03 15:37:41 +02:00
parent ed0d5d3e1d
commit 4988e9f80d
6 changed files with 148 additions and 7 deletions

View File

@ -1,7 +1,9 @@
package eu.dnetlib.pace.clustering;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
@ -9,9 +11,14 @@ import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;
import org.apache.commons.lang3.StringUtils;
public class ClusteringCombiner {
private static String SEPARATOR = ":";
private static String COLLAPSE_ON= "collapseOn";
/**
 * Static entry point for computing the clustering keys of a document.
 * Creates a fresh {@link ClusteringCombiner} and delegates to its {@code doCombine} method.
 *
 * @param a    the document whose field values are fed to the clustering functions
 * @param conf the configuration handed to {@code doCombine}
 * @return the collection of keys returned by {@code doCombine}
 */
public static Collection<String> combine(final Document a, final Config conf) {
    final ClusteringCombiner combiner = new ClusteringCombiner();
    return combiner.doCombine(a, conf);
}
@ -20,10 +27,38 @@ public class ClusteringCombiner {
final Collection<String> res = Sets.newLinkedHashSet();
for (final ClusteringDef cd : conf.clusterings()) {
for (final String fieldName : cd.getFields()) {
final Field values = a.values(fieldName);
res.addAll(cd.clusteringFunction().apply(conf, (List<Field>) values));
String prefix = getPrefix(cd, fieldName);
Field values = a.values(fieldName);
List<Field> fields = new ArrayList<>();
if (values instanceof FieldValueImpl) {
fields.add(values);
}
else {
fields.addAll((List<Field>) values);
}
res.addAll(
cd.clusteringFunction()
.apply(conf, fields)
.stream()
.map(k -> prefix + SEPARATOR +k)
.collect(Collectors.toList())
);
}
}
return res;
}
/**
 * Builds the prefix prepended to every key produced by a clustering definition.
 * <p>
 * The prefix has the form {@code <clusteringName>:<group>}, where {@code group} is:
 * <ul>
 *   <li>the text following the separator in the first parameter key that contains
 *       {@code collapseOn} and carries a non-empty name (e.g. {@code "collapseOn:test"}
 *       gives {@code "test"});</li>
 *   <li>otherwise the name of the field being clustered.</li>
 * </ul>
 *
 * @param cd        the clustering definition supplying the name and the parameters
 * @param fieldName the field currently being clustered, used as the fallback group
 * @return the prefix, without a trailing separator
 */
private String getPrefix(ClusteringDef cd, String fieldName) {
    final String group = cd.getParams().keySet()
        .stream()
        .filter(k -> k.contains(COLLAPSE_ON))
        .map(k -> StringUtils.substringAfter(k, SEPARATOR))
        // substringAfter returns "" when the key has no separator or nothing after it;
        // such a key names no group, so skip it rather than emit an empty group
        .filter(StringUtils::isNotEmpty)
        .findFirst()
        .orElse(fieldName);
    return cd.getName() + SEPARATOR + group;
}
}

View File

@ -5,6 +5,7 @@ import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.Serializable;
@ -12,6 +13,8 @@ import java.util.List;
public class TreeNodeDef implements Serializable {
final static String CROSS_COMPARE = "crossCompare";
private List<FieldConf> fields;
private AggType aggregation;
@ -45,7 +48,17 @@ public class TreeNodeDef implements Serializable {
double weight = fieldConf.getWeight();
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
double result;
//if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum
if(fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) {
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
double result1 = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf);
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
result = Math.max(result1,result2);
}
else
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),

View File

@ -175,5 +175,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
}
}

View File

@ -2,16 +2,24 @@ package eu.dnetlib.pace.config;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringCombiner;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.tree.support.AggType;
import eu.dnetlib.pace.tree.support.FieldConf;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.tree.support.TreeNodeStats;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -20,13 +28,14 @@ import java.util.stream.Collectors;
public class ConfigTest extends AbstractPaceTest {
private Map<String, String> params;
private static Map<String, String> params;
@BeforeAll
public void setup() {
public static void setup() {
params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
}
@Test
@ -102,4 +111,37 @@ public class ConfigTest extends AbstractPaceTest {
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
}
/**
 * Loads the publication configuration and sample document from the classpath, runs
 * {@link ClusteringCombiner#combine} and checks the second ':'-separated segment of the
 * first three keys.
 */
@Test
public void clusteringCombinerTest() {
    final DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
    final MapDocument mapDocument =
        MapDocumentUtil.asMapDocumentWithJPath(dedupConf, readFromClasspath("publication.json"));

    final Collection<String> keys = ClusteringCombiner.combine(mapDocument, dedupConf);
    final String[] combine = keys.toArray(new String[3]);

    // expected second segment of the keys, in the order they are produced
    final String[] expectedSegments = {"test", "title", "doi"};
    for (int i = 0; i < expectedSegments.length; i++) {
        assertEquals(expectedSegments[i], combine[i].split(":")[1]);
    }
}
/**
 * Loads the cross-compare organization configuration, evaluates the "start" node of its
 * decision tree on a sample organization compared with itself, and checks that the
 * final score under {@code AggType.MAX} is above 0.7.
 */
@Test
public void crossCompareTest() {
    final DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.cross.compare.conf.json"));
    final TreeNodeDef startNode = dedupConf.decisionTree().get("start");

    final MapDocument doc =
        MapDocumentUtil.asMapDocumentWithJPath(dedupConf, readFromClasspath("organization.json"));

    // the same document is passed on both sides of the comparison
    final TreeNodeStats nodeStats = startNode.evaluate(doc, doc, dedupConf);
    final double finalScore = nodeStats.getFinalScore(AggType.MAX);

    assertTrue(finalScore > 0.7);
}
}

View File

@ -0,0 +1,52 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"idPath":"$.id",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true",
"maxIterations": "20"
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"decisionTree" : {
"start": {
"fields": [
{
"field": "legalname",
"comparator": "levenstein",
"weight": 1,
"countIfUndefined": "false",
"params": {
"crossCompare" : "legalshortname"
}
}
],
"threshold": 1,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model" : [
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" }
],
"blacklists" : {
"legalname" : []
},
"synonyms": {}
}
}

View File

@ -29,7 +29,7 @@
},
"pace": {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3", "collapseOn:test": ""} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],