From cb595c87bbada77a38a8e1851e361b1e1f88ddaa Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 17 Apr 2023 11:06:27 +0200 Subject: [PATCH] implementation of the support for authors deduplication: cosinesimilarity comparator and double array json parser --- .../java/eu/dnetlib/pace/config/Type.java | 2 +- .../eu/dnetlib/pace/model/FieldValue.java | 2 + .../eu/dnetlib/pace/model/FieldValueImpl.java | 10 +- .../eu/dnetlib/pace/tree/AuthorsMatch.java | 2 +- .../dnetlib/pace/tree/CosineSimilarity.java | 53 ++++++++++ .../eu/dnetlib/pace/tree/StringListMatch.java | 11 ++- .../pace/util/BlockProcessorForTesting.java | 1 - .../eu/dnetlib/pace/util/MapDocumentUtil.java | 38 +++++++- .../eu/dnetlib/pace/AbstractPaceTest.java | 4 + .../pace/comparators/ComparatorTest.java | 15 +++ .../eu/dnetlib/pace/config/ConfigTest.java | 12 +-- ...r.test.conf.json => author.fdup.conf.json} | 96 ++++++++++++------- .../eu/dnetlib/pace/config/author.json | 2 +- 13 files changed, 197 insertions(+), 51 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java rename dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/{author.test.conf.json => author.fdup.conf.json} (50%) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java index 33ae4015f..20981c427 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java @@ -1,5 +1,5 @@ package eu.dnetlib.pace.config; public enum Type { - String, Int, List, JSON, URL, StringConcat + String, Int, List, JSON, URL, StringConcat, DoubleArray } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java index 861332729..ebe474363 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java @@ -20,4 +20,6 @@ public interface FieldValue extends Field { */ public void setValue(final Object value); + public double[] doubleArrayValue(); + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java index 0a72c07c6..a235315d9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java @@ -58,8 +58,10 @@ public class FieldValueImpl extends AbstractField implements FieldValue { throw new RuntimeException(value.toString()); } case URL: - String str = value.toString(); - return StringUtils.isBlank(str) || !isValidURL(str); + String str = value.toString(); + return StringUtils.isBlank(str) || !isValidURL(str); + case DoubleArray: + return doubleArrayValue().length==0; default: return true; } @@ -116,6 +118,10 @@ public class FieldValueImpl extends AbstractField implements FieldValue { // } } + public double[] doubleArrayValue() { + return (double[])getValue(); + } + /* * (non-Javadoc) * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index c1b536ecd..33f86d85d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -52,7 +52,7 @@ public class AuthorsMatch extends AbstractComparator { if (a.isEmpty() || b.isEmpty()) return -1; - if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) a).size() > SIZE_THRESHOLD) + if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD) return 1.0; List aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList()); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java new file mode 100644 index 000000000..5d441771e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java @@ -0,0 +1,53 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.FieldValueImpl; +import eu.dnetlib.pace.model.Person; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +@ComparatorClass("cosineSimilarity") +public class CosineSimilarity extends AbstractComparator { + + Map params; + + public CosineSimilarity(Map params) { + super(params); + } + + @Override + public double compare(final Field a, final Field b, final Config conf) { + + if (a.isEmpty() || b.isEmpty()) + return -1; + + double[] aVector = ((FieldValueImpl) a).doubleArrayValue(); + double[] bVector = ((FieldValueImpl) b).doubleArrayValue(); + + return cosineSimilarity(aVector, bVector); + } + + double cosineSimilarity(double[] a, double[] b) { + double dotProduct = 0; + double normASum = 0; + double normBSum = 0; + + for(int i = 0; i < a.length; i ++) { + dotProduct += a[i] * b[i]; + normASum += a[i] * a[i]; + normBSum += b[i] * b[i]; + } + + double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum); + return dotProduct / eucledianDist; + } + + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java index 3ed98a04c..e67a7ea0b 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java @@ -19,9 +19,13 @@ public class StringListMatch extends AbstractComparator { private static final Log log = LogFactory.getLog(StringListMatch.class); private Map params; + final private String TYPE; //percentage or count + public StringListMatch(final Map params) { super(params); this.params = params; + + TYPE = params.getOrDefault("type", "percentage"); } @Override @@ -31,7 +35,7 @@ public class StringListMatch extends AbstractComparator { final Set pb = new HashSet<>(((FieldList) b).stringList()); if (pa.isEmpty() || pb.isEmpty()) { - return -1; //return undefined if one of the two lists of pids is empty + return -1; //return undefined if one of the two lists is empty } int incommon = Sets.intersection(pa, pb).size(); @@ -41,7 +45,10 @@ public class StringListMatch extends AbstractComparator { return 0.0; } - return (double)incommon / (incommon + simDiff); + if(TYPE.equals("percentage")) + return (double)incommon / (incommon + simDiff); + else + return incommon; } } \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java index 174c5c17e..24264c0bf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java @@ -244,6 +244,5 @@ public class BlockProcessorForTesting { final String type = dedupConf.getWf().getEntityType(); context.emit(type, from, to); - context.emit(type, to, from); } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java index f9bd9399b..cc801068b 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -7,12 +7,10 @@ import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.Option; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; -import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.*; import net.minidev.json.JSONArray; +import java.math.BigDecimal; import java.util.*; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -46,6 +44,14 @@ public class MapDocumentUtil { .forEach(fi::add); stringField.put(fdef.getName(), fi); break; + case DoubleArray: + stringField.put( + fdef.getName(), + new FieldValueImpl(Type.DoubleArray, + fdef.getName(), + getJPathArray(fdef.getPath(), json)) + ); + break; case StringConcat: String[] jpaths = fdef.getPath().split("\\|\\|\\|"); stringField.put( @@ -115,6 +121,30 @@ public class MapDocumentUtil { } } + public static double[] getJPathArray(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof double[]) + return (double[]) o; + if (o instanceof JSONArray) { + Object[] objects = ((JSONArray) o).toArray(); + double[] array = new double[objects.length]; + for (int i = 0; i < objects.length; i++) { + if (objects[i] instanceof BigDecimal) + array[i] = ((BigDecimal)objects[i]).doubleValue(); + else + array[i] = (double) objects[i]; + } + return array; + } + return new double[0]; + } + catch (Exception e) { + e.printStackTrace(); + return new double[0]; + } + } + public static String truncateValue(String value, int length) { if (value == null) diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java index 14e1e8d0d..b98fd989b 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java @@ -36,6 +36,10 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions { return new FieldValueImpl(Type.URL, "url", s); } + protected Field array(final double[] a) { + return new FieldValueImpl(Type.DoubleArray, "array", a); + } + protected Field createFieldList(List strings, String fieldName){ List fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList()); diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index b79305b92..b19d77e5c 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -2,7 +2,9 @@ package eu.dnetlib.pace.comparators; import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldValueImpl; import eu.dnetlib.pace.tree.*; import eu.dnetlib.pace.config.DedupConfig; @@ -284,5 +286,18 @@ public class ComparatorTest extends AbstractPaceTest { } + @Test + public void cosineSimilarity() { + + CosineSimilarity cosineSimilarity = new CosineSimilarity(params); + + Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3}); + Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3}); + + double compare = cosineSimilarity.compare(a, b, conf); + + System.out.println("compare = " + compare); + } + } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 56d8530be..4a2a062a1 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -7,6 +7,7 @@ import eu.dnetlib.pace.clustering.ClusteringClass; import eu.dnetlib.pace.clustering.ClusteringCombiner; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.FieldValue; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.tree.JsonListMatch; import eu.dnetlib.pace.tree.support.AggType; @@ -20,10 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; @@ -104,15 +102,15 @@ public class ConfigTest extends AbstractPaceTest { } @Test - public void asMapDocumentTest2() { + public void authorAsMapDocument() { - DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.test.conf.json")); + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json")); final String json = readFromClasspath("author.json"); final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); - System.out.println("mapDocument = " + mapDocument.getFieldMap().get("coauthors").stringValue()); + System.out.println("mapDocument = " + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue())); } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.fdup.conf.json similarity index 50% rename from dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json rename to dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.fdup.conf.json index 0ac29f875..c7d158ce6 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.fdup.conf.json @@ -29,71 +29,103 @@ }, "pace": { "clustering" : [ - { "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} }, - { "name" : "personHash", "fields" : [ "fullname" ], "params" : {} } + { "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} } ], "decisionTree": { "start": { "fields": [ { - "field": "year", - "comparator": "numbersComparator", - "weight": 1, - "countIfUndefined": "false", + "field": "orcid", + "comparator": "exactMatch", + "weight": 1.0, + "countIfUndefined": "true", "params": {} } ], - "threshold": 50, + "threshold": 1.0, "aggregation": "MAX", - "positive": "NO_MATCH", - "negative": "surnames", - "undefined": "surnames", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "orcids", "ignoreUndefined": "true" }, - "surnames": { + "orcids": { + "fields": [ + { + "field": "orcids", + "comparator": "stringListMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": {"type": "count"} + } + ], + "threshold": 3.0, + "aggregation": "MAX", + "positive": "MATCH", + "negative": "coauthors", + "undefined": "coauthors", + "ignoreUndefined": "true" + }, + "coauthors": { "fields": [ { "field": "coauthors", "comparator": "authorsMatch", "weight": 1.0, - "countIfUndefined": "false", - "params": { - "surname_th": 0.75, - "fullname_th": 0.75, - "size_th": 20, - "mode": "surname" - } + "countIfUndefined": "true", + "params": {"type": "count"} } ], - "threshold": 0.6, + "threshold": 1.0, + "aggregation": "MAX", + "positive": "topicsMatch", + "negative": "NO_MATCH", + "undefined": "topicsMatch", + "ignoreUndefined": "true" + }, + "topicsMatch": { + "fields": [ + { + "field": "topics", + "comparator": "cosineSimilarity", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 1.0, "aggregation": "MAX", "positive": "MATCH", "negative": "NO_MATCH", - "undefined": "MATCH", - "ignoreUndefined": "true" + "undefined": "NO_MATCH", + "ignoreUndefined": "false" } }, "model": [ + { + "name": "topics", + "type": "DoubleArray", + "path": "$.topics" + }, { "name": "fullname", "type": "String", - "path": "$.name" + "path": "$.fullname" + }, + { + "name": "orcid", + "type": "String", + "path": "$.orcid" }, { "name": "coauthors", "type": "List", - "path": "$.coauthors[*].name", - "size": 200 + "path": "$.coAuthors[*].fullname" }, { - "name": "year", - "type": "String", - "path": "$.publication.year" - }, - { - "name": "title", - "type": "String", - "path": "$.publication.title" + "name": "orcids", + "type": "List", + "path": "$.coAuthors[*].orcid" } ], "blacklists": {}, diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json index 62c6e9185..f867afda1 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json @@ -1 +1 @@ -{"id": "f3389e7c8af1d806c06e2ab51f28a4b4", "name": "Aczél, János", "shortname": "Aczél, J.", "pid": "aczel.janos", "coauthors": [], "publication": {"year": "1955", "title": "L\\\"osung der Vektor-Funktionalgleichung der homogenen und inhomogenen $n$-dimensionalen einparametrigen ``Translation'' der erzeugenden Funktion von Kettenreaktionen und des station\\\"aren und nichtstation\\\"aren Bewegungsintegrals", "venue": "Acta Math. Acad. Sci. Hung. 6, 131-140 (1955)."}} \ No newline at end of file +{"fullname":"Zaragoza, Maria Cleofé","firstname":"Maria Cleofé","lastname":"Zaragoza","coAuthors":[{"fullname":"Cambras, Trinitat","lastname":"Cambras","firstname":"Trinitat","orcid":"0000-0002-9009-4690"},{"fullname":"Castro-Marrero, Jesús","lastname":"Castro-Marrero","firstname":"Jesús","orcid":""},{"fullname":"Díez-Noguera, Antoni","lastname":"Díez-Noguera","firstname":"Antoni","orcid":""},{"fullname":"Alegre, José","lastname":"Alegre","firstname":"José","orcid":"0000-0002-7582-7585"}],"topics":[0.9522090839562252,0.04779091604377485],"orcid":"0000-0002-9797-0219","id":"author::1a10826c83c7f9f0dcebe7df05e37a2a","pubId":"50|pmid________::db7fd19db5a620eafad40cfb97f9690d"} \ No newline at end of file