forked from D-Net/dnet-hadoop
Implement cross comparison between different fields; add a clustering mechanism that collapses keys from different clustering functions into the same cluster
This commit is contained in:
parent
ed0d5d3e1d
commit
4988e9f80d
|
@ -1,7 +1,9 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
|
@ -9,9 +11,14 @@ import eu.dnetlib.pace.config.Config;
|
|||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class ClusteringCombiner {
|
||||
|
||||
private static String SEPARATOR = ":";
|
||||
private static String COLLAPSE_ON= "collapseOn";
|
||||
|
||||
public static Collection<String> combine(final Document a, final Config conf) {
|
||||
return new ClusteringCombiner().doCombine(a, conf);
|
||||
}
|
||||
|
@ -20,10 +27,38 @@ public class ClusteringCombiner {
|
|||
final Collection<String> res = Sets.newLinkedHashSet();
|
||||
for (final ClusteringDef cd : conf.clusterings()) {
|
||||
for (final String fieldName : cd.getFields()) {
|
||||
final Field values = a.values(fieldName);
|
||||
res.addAll(cd.clusteringFunction().apply(conf, (List<Field>) values));
|
||||
String prefix = getPrefix(cd, fieldName);
|
||||
|
||||
Field values = a.values(fieldName);
|
||||
List<Field> fields = new ArrayList<>();
|
||||
|
||||
if (values instanceof FieldValueImpl) {
|
||||
fields.add(values);
|
||||
}
|
||||
else {
|
||||
fields.addAll((List<Field>) values);
|
||||
}
|
||||
|
||||
res.addAll(
|
||||
cd.clusteringFunction()
|
||||
.apply(conf, fields)
|
||||
.stream()
|
||||
.map(k -> prefix + SEPARATOR +k)
|
||||
.collect(Collectors.toList())
|
||||
);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private String getPrefix(ClusteringDef cd, String fieldName) {
|
||||
return cd.getName()+ SEPARATOR +
|
||||
cd.getParams().keySet()
|
||||
.stream()
|
||||
.filter(k -> k.contains(COLLAPSE_ON))
|
||||
.findFirst()
|
||||
.map(k -> StringUtils.substringAfter(k, SEPARATOR))
|
||||
.orElse(fieldName);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@ import eu.dnetlib.pace.config.Config;
|
|||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
@ -12,6 +13,8 @@ import java.util.List;
|
|||
|
||||
public class TreeNodeDef implements Serializable {
|
||||
|
||||
final static String CROSS_COMPARE = "crossCompare";
|
||||
|
||||
private List<FieldConf> fields;
|
||||
private AggType aggregation;
|
||||
|
||||
|
@ -45,7 +48,17 @@ public class TreeNodeDef implements Serializable {
|
|||
|
||||
double weight = fieldConf.getWeight();
|
||||
|
||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
double result;
|
||||
|
||||
//if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum
|
||||
if(fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) {
|
||||
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
|
||||
double result1 = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf);
|
||||
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
result = Math.max(result1,result2);
|
||||
}
|
||||
else
|
||||
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
|
||||
stats.addFieldStats(
|
||||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||
|
|
|
@ -175,5 +175,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,16 +2,24 @@ package eu.dnetlib.pace.config;
|
|||
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.clustering.ClusteringClass;
|
||||
import eu.dnetlib.pace.clustering.ClusteringCombiner;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.tree.JsonListMatch;
|
||||
import eu.dnetlib.pace.tree.support.AggType;
|
||||
import eu.dnetlib.pace.tree.support.FieldConf;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeStats;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -20,13 +28,14 @@ import java.util.stream.Collectors;
|
|||
|
||||
public class ConfigTest extends AbstractPaceTest {
|
||||
|
||||
private Map<String, String> params;
|
||||
private static Map<String, String> params;
|
||||
|
||||
@BeforeAll
|
||||
public void setup() {
|
||||
public static void setup() {
|
||||
params = new HashMap<>();
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -102,4 +111,37 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
|
||||
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void clusteringCombinerTest() {
|
||||
|
||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
||||
|
||||
final String json = readFromClasspath("publication.json");
|
||||
|
||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||
|
||||
String[] combine = ClusteringCombiner.combine(mapDocument, dedupConf).toArray(new String[3]);
|
||||
|
||||
assertEquals("test", combine[0].split(":")[1]);
|
||||
assertEquals("title", combine[1].split(":")[1]);
|
||||
assertEquals("doi", combine[2].split(":")[1]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void crossCompareTest() {
|
||||
|
||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.cross.compare.conf.json"));
|
||||
|
||||
TreeNodeDef treeNode = dedupConf.decisionTree().get("start");
|
||||
|
||||
final String json = readFromClasspath("organization.json");
|
||||
|
||||
final MapDocument doc = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||
|
||||
TreeNodeStats nodeStats = treeNode.evaluate(doc, doc, dedupConf);
|
||||
|
||||
assertTrue(nodeStats.getFinalScore(AggType.MAX)>0.7);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"idPath":"$.id",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true",
|
||||
"maxIterations": "20"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
],
|
||||
"decisionTree" : {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "legalname",
|
||||
"comparator": "levenstein",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"crossCompare" : "legalshortname"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "false"
|
||||
}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
|
@ -29,7 +29,7 @@
|
|||
},
|
||||
"pace": {
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3", "collapseOn:test": ""} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||
],
|
||||
|
|
Loading…
Reference in New Issue