forked from D-Net/dnet-hadoop
Implementation of cross comparison between different fields; addition of a clustering mechanism to collapse keys from different clustering functions into the same cluster
This commit is contained in:
parent
ed0d5d3e1d
commit
4988e9f80d
|
@ -1,7 +1,9 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
@ -9,9 +11,14 @@ import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.ClusteringDef;
|
import eu.dnetlib.pace.model.ClusteringDef;
|
||||||
import eu.dnetlib.pace.model.Document;
|
import eu.dnetlib.pace.model.Document;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
public class ClusteringCombiner {
|
public class ClusteringCombiner {
|
||||||
|
|
||||||
|
private static String SEPARATOR = ":";
|
||||||
|
private static String COLLAPSE_ON= "collapseOn";
|
||||||
|
|
||||||
public static Collection<String> combine(final Document a, final Config conf) {
|
public static Collection<String> combine(final Document a, final Config conf) {
|
||||||
return new ClusteringCombiner().doCombine(a, conf);
|
return new ClusteringCombiner().doCombine(a, conf);
|
||||||
}
|
}
|
||||||
|
@ -20,10 +27,38 @@ public class ClusteringCombiner {
|
||||||
final Collection<String> res = Sets.newLinkedHashSet();
|
final Collection<String> res = Sets.newLinkedHashSet();
|
||||||
for (final ClusteringDef cd : conf.clusterings()) {
|
for (final ClusteringDef cd : conf.clusterings()) {
|
||||||
for (final String fieldName : cd.getFields()) {
|
for (final String fieldName : cd.getFields()) {
|
||||||
final Field values = a.values(fieldName);
|
String prefix = getPrefix(cd, fieldName);
|
||||||
res.addAll(cd.clusteringFunction().apply(conf, (List<Field>) values));
|
|
||||||
|
Field values = a.values(fieldName);
|
||||||
|
List<Field> fields = new ArrayList<>();
|
||||||
|
|
||||||
|
if (values instanceof FieldValueImpl) {
|
||||||
|
fields.add(values);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fields.addAll((List<Field>) values);
|
||||||
|
}
|
||||||
|
|
||||||
|
res.addAll(
|
||||||
|
cd.clusteringFunction()
|
||||||
|
.apply(conf, fields)
|
||||||
|
.stream()
|
||||||
|
.map(k -> prefix + SEPARATOR +k)
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String getPrefix(ClusteringDef cd, String fieldName) {
|
||||||
|
return cd.getName()+ SEPARATOR +
|
||||||
|
cd.getParams().keySet()
|
||||||
|
.stream()
|
||||||
|
.filter(k -> k.contains(COLLAPSE_ON))
|
||||||
|
.findFirst()
|
||||||
|
.map(k -> StringUtils.substringAfter(k, SEPARATOR))
|
||||||
|
.orElse(fieldName);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,6 +5,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.PaceConfig;
|
import eu.dnetlib.pace.config.PaceConfig;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
@ -12,6 +13,8 @@ import java.util.List;
|
||||||
|
|
||||||
public class TreeNodeDef implements Serializable {
|
public class TreeNodeDef implements Serializable {
|
||||||
|
|
||||||
|
final static String CROSS_COMPARE = "crossCompare";
|
||||||
|
|
||||||
private List<FieldConf> fields;
|
private List<FieldConf> fields;
|
||||||
private AggType aggregation;
|
private AggType aggregation;
|
||||||
|
|
||||||
|
@ -45,7 +48,17 @@ public class TreeNodeDef implements Serializable {
|
||||||
|
|
||||||
double weight = fieldConf.getWeight();
|
double weight = fieldConf.getWeight();
|
||||||
|
|
||||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
double result;
|
||||||
|
|
||||||
|
//if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum
|
||||||
|
if(fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) {
|
||||||
|
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
|
||||||
|
double result1 = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf);
|
||||||
|
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||||
|
result = Math.max(result1,result2);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||||
|
|
||||||
stats.addFieldStats(
|
stats.addFieldStats(
|
||||||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||||
|
|
|
@ -175,5 +175,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,16 +2,24 @@ package eu.dnetlib.pace.config;
|
||||||
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.AbstractPaceTest;
|
import eu.dnetlib.pace.AbstractPaceTest;
|
||||||
|
import eu.dnetlib.pace.clustering.ClusteringClass;
|
||||||
|
import eu.dnetlib.pace.clustering.ClusteringCombiner;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
import eu.dnetlib.pace.model.FieldList;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.tree.JsonListMatch;
|
import eu.dnetlib.pace.tree.JsonListMatch;
|
||||||
|
import eu.dnetlib.pace.tree.support.AggType;
|
||||||
|
import eu.dnetlib.pace.tree.support.FieldConf;
|
||||||
|
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||||
|
import eu.dnetlib.pace.tree.support.TreeNodeStats;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.*;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -20,13 +28,14 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
public class ConfigTest extends AbstractPaceTest {
|
public class ConfigTest extends AbstractPaceTest {
|
||||||
|
|
||||||
private Map<String, String> params;
|
private static Map<String, String> params;
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public void setup() {
|
public static void setup() {
|
||||||
params = new HashMap<>();
|
params = new HashMap<>();
|
||||||
params.put("jpath_value", "$.value");
|
params.put("jpath_value", "$.value");
|
||||||
params.put("jpath_classid", "$.qualifier.classid");
|
params.put("jpath_classid", "$.qualifier.classid");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -102,4 +111,37 @@ public class ConfigTest extends AbstractPaceTest {
|
||||||
|
|
||||||
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
|
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void clusteringCombinerTest() {
|
||||||
|
|
||||||
|
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
||||||
|
|
||||||
|
final String json = readFromClasspath("publication.json");
|
||||||
|
|
||||||
|
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||||
|
|
||||||
|
String[] combine = ClusteringCombiner.combine(mapDocument, dedupConf).toArray(new String[3]);
|
||||||
|
|
||||||
|
assertEquals("test", combine[0].split(":")[1]);
|
||||||
|
assertEquals("title", combine[1].split(":")[1]);
|
||||||
|
assertEquals("doi", combine[2].split(":")[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void crossCompareTest() {
|
||||||
|
|
||||||
|
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.cross.compare.conf.json"));
|
||||||
|
|
||||||
|
TreeNodeDef treeNode = dedupConf.decisionTree().get("start");
|
||||||
|
|
||||||
|
final String json = readFromClasspath("organization.json");
|
||||||
|
|
||||||
|
final MapDocument doc = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||||
|
|
||||||
|
TreeNodeStats nodeStats = treeNode.evaluate(doc, doc, dedupConf);
|
||||||
|
|
||||||
|
assertTrue(nodeStats.getFinalScore(AggType.MAX)>0.7);
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
{
|
||||||
|
"wf" : {
|
||||||
|
"threshold" : "0.99",
|
||||||
|
"dedupRun" : "001",
|
||||||
|
"entityType" : "organization",
|
||||||
|
"orderField" : "legalname",
|
||||||
|
"queueMaxSize" : "2000",
|
||||||
|
"groupMaxSize" : "50",
|
||||||
|
"slidingWindowSize" : "200",
|
||||||
|
"idPath":"$.id",
|
||||||
|
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||||
|
"includeChildren" : "true",
|
||||||
|
"maxIterations": "20"
|
||||||
|
},
|
||||||
|
"pace" : {
|
||||||
|
"clustering" : [
|
||||||
|
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||||
|
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||||
|
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||||
|
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||||
|
],
|
||||||
|
"decisionTree" : {
|
||||||
|
"start": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "levenstein",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {
|
||||||
|
"crossCompare" : "legalshortname"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1,
|
||||||
|
"aggregation": "AVG",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "NO_MATCH",
|
||||||
|
"ignoreUndefined": "false"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"model" : [
|
||||||
|
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
|
||||||
|
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" }
|
||||||
|
],
|
||||||
|
"blacklists" : {
|
||||||
|
"legalname" : []
|
||||||
|
},
|
||||||
|
"synonyms": {}
|
||||||
|
}
|
||||||
|
}
|
|
@ -29,7 +29,7 @@
|
||||||
},
|
},
|
||||||
"pace": {
|
"pace": {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3", "collapseOn:test": ""} },
|
||||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||||
],
|
],
|
||||||
|
|
Loading…
Reference in New Issue