bug fix in the authormatch comparator, implementation of tests
This commit is contained in:
parent 6eb0730188
commit e168d95ec0
@@ -1,2 +1 @@
-# Mon Sep 13 14:51:29 CEST 2021
-projectPropertyKey=projectPropertyValue
+# Thu Dec 30 13:11:51 CET 2021

@@ -85,7 +85,7 @@ public class Deduper implements Serializable {
     }
 
     public static JavaRDD<Relation> computeRelations(
-        JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config, boolean useTree) {
+        JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config, boolean useTree, boolean noMatch) {
         Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
 
         return blocks
@@ -93,7 +93,7 @@ public class Deduper implements Serializable {
             it -> {
                 final SparkReporter reporter = new SparkReporter(accumulators);
                 new BlockProcessorForTesting(config)
-                    .processSortedBlock(it._1(), it._2().getDocuments(), reporter, useTree);
+                    .processSortedBlock(it._1(), it._2().getDocuments(), reporter, useTree, noMatch);
                 return reporter.getRelations().iterator();
             })
             .mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel")))
@@ -101,7 +101,7 @@ public class Deduper implements Serializable {
             .map(Tuple2::_2);
     }
 
-    public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath, boolean useTree){
+    public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath, boolean useTree, boolean noMatch){
 
         JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
@@ -117,7 +117,7 @@ public class Deduper implements Serializable {
         JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
 
         // create relations by comparing only elements in the same group
-        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConf, useTree);
+        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConf, useTree, noMatch);
 
         // save the simrel in the workingdir
         spark

@@ -83,7 +83,7 @@ public class SparkCreateSimRels extends AbstractSparkJob {
         JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig);
 
         // create relations by comparing only elements in the same group
-        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConfig, useTree);
+        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConfig, useTree, false);
 
         // save the simrel in the workingdir
         spark

File diff suppressed because one or more lines are too long
@@ -51,6 +51,27 @@
     ],
     "decisionTree": {
+        "start": {
+            "fields": [
+                {
+                    "field": "pid",
+                    "comparator": "jsonListMatch",
+                    "weight": 1.0,
+                    "countIfUndefined": "false",
+                    "params": {
+                        "jpath_value": "$.value",
+                        "jpath_classid": "$.qualifier.classid",
+                        "mode": "count"
+                    }
+                }
+            ],
+            "threshold": 1.0,
+            "aggregation": "MAX",
+            "positive": "MATCH",
+            "negative": "NO_MATCH",
+            "undefined": "instanceTypeCheck",
+            "ignoreUndefined": "false"
+        },
         "instanceTypeCheck": {
             "fields": [
                 {
                     "field": "instance",
@@ -62,23 +83,13 @@
             ],
             "threshold": 0.5,
             "aggregation": "MAX",
-            "positive": "layer1",
+            "positive": "pidVSaltid",
             "negative": "NO_MATCH",
-            "undefined": "layer1",
+            "undefined": "pidVSaltid",
             "ignoreUndefined": "true"
         },
-        "layer1": {
+        "pidVSaltid": {
             "fields": [
-                {
-                    "field": "pid",
-                    "comparator": "jsonListMatch",
-                    "weight": 1.0,
-                    "countIfUndefined": "false",
-                    "params": {
-                        "jpath_value": "$.value",
-                        "jpath_classid": "$.qualifier.classid"
-                    }
-                },
                 {
                     "field": "pid",
                     "comparator": "jsonListMatch",
@@ -87,18 +98,19 @@
                     "params": {
                         "jpath_value": "$.value",
                         "jpath_classid": "$.qualifier.classid",
-                        "crossCompare": "alternateid"
+                        "crossCompare": "alternateid",
+                        "mode": "count"
                     }
                 }
             ],
-            "threshold": 0.5,
+            "threshold": 1.0,
             "aggregation": "MAX",
-            "positive": "layer2",
-            "negative": "layer3",
-            "undefined": "layer3",
+            "positive": "softCheck",
+            "negative": "earlyExits",
+            "undefined": "earlyExits",
             "ignoreUndefined": "true"
         },
-        "layer2": {
+        "softCheck": {
             "fields": [
                 {
                     "field": "title",
@@ -115,7 +127,7 @@
             "undefined": "NO_MATCH",
             "ignoreUndefined": "true"
         },
-        "layer3": {
+        "earlyExits": {
             "fields": [
                 {
                     "field": "title",
@@ -134,12 +146,12 @@
             ],
             "threshold": 1.0,
             "aggregation": "AND",
-            "positive": "layer4",
+            "positive": "strongCheck",
             "negative": "NO_MATCH",
-            "undefined": "layer4",
+            "undefined": "strongCheck",
             "ignoreUndefined": "false"
         },
-        "layer4": {
+        "strongCheck": {
             "fields": [
                 {
                     "field": "title",
@@ -151,10 +163,31 @@
             ],
             "threshold": 0.99,
             "aggregation": "AVG",
-            "positive": "MATCH",
+            "positive": "surnames",
             "negative": "NO_MATCH",
             "undefined": "NO_MATCH",
             "ignoreUndefined": "true"
         },
+        "surnames": {
+            "fields": [
+                {
+                    "field": "authors",
+                    "comparator": "authorsMatch",
+                    "weight": 1.0,
+                    "countIfUndefined": "false",
+                    "params": {
+                        "surname_th": 0.75,
+                        "fullname_th": 0.75,
+                        "mode": "surname"
+                    }
+                }
+            ],
+            "threshold": 0.6,
+            "aggregation": "MAX",
+            "positive": "MATCH",
+            "negative": "NO_MATCH",
+            "undefined": "MATCH",
+            "ignoreUndefined": "true"
+        }
     },
     "model": [

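For orientation, a minimal sketch of how a decision-tree configuration like the one above is typically driven: the tree is walked node by node (start, instanceTypeCheck, pidVSaltid, softCheck or earlyExits, strongCheck, surnames) until it lands on MATCH or NO_MATCH. The DedupConfig.load call, the publicationConfJson variable and the pivot/curr records below are assumptions for illustration, not part of this commit; only the TreeProcessor call mirrors the one used in BlockProcessorForTesting further down.

// Sketch under assumptions: DedupConfig.load(json) is assumed to be the config loader,
// and pivot/curr are two MapDocument records built elsewhere.
DedupConfig dedupConf = DedupConfig.load(publicationConfJson);              // publicationConfJson: the JSON shown above (hypothetical variable)
boolean simRel = new TreeProcessor(dedupConf).compare(pivot, curr);         // true when the walk ends in MATCH, e.g. via the new "surnames" node
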
@@ -16,6 +16,8 @@ import org.apache.commons.lang3.StringUtils;
 
 import java.io.IOException;
 import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.function.Function;
@@ -160,6 +162,11 @@ public abstract class AbstractPaceFunctions {
         return Normalizer.normalize(s, Normalizer.Form.NFD);
     }
 
+    public String utf8(final String s) {
+        byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
+        return new String(bytes, StandardCharsets.UTF_8);
+    }
+
     public String unicodeNormalization(final String s) {
 
         Matcher m = hexUnicodePattern.matcher(s);

@@ -1,13 +1,13 @@
 package eu.dnetlib.pace.tree;
 
 import com.google.common.collect.Iterables;
-import com.wcohen.ss.JaroWinkler;
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.model.Field;
 import eu.dnetlib.pace.model.FieldList;
 import eu.dnetlib.pace.model.Person;
 import eu.dnetlib.pace.tree.support.AbstractComparator;
 import eu.dnetlib.pace.tree.support.ComparatorClass;
+import com.wcohen.ss.AbstractStringDistance;
 
 import java.util.Comparator;
 import java.util.List;
@@ -25,6 +25,7 @@ public class AuthorsMatch extends AbstractComparator {
     private double NAME_THRESHOLD;
     private double FULLNAME_THRESHOLD;
     private String MODE; //full or surname
+    private int common;
 
     public AuthorsMatch(Map<String, String> params){
         super(params, new com.wcohen.ss.JaroWinkler());
@@ -34,6 +35,11 @@ public class AuthorsMatch extends AbstractComparator {
         SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
         NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
         FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
+        common = 0;
     }
 
+    protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
+        super(w, ssalgo);
+    }
+
     @Override
@@ -45,41 +51,85 @@ public class AuthorsMatch extends AbstractComparator {
         List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
         List<Person> bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
 
-        int common = 0;
+        common = 0;
         //compare each element of List1 with each element of List2
         for (Person p1 : aList)
-            for (Person p2 : bList)
-                if (MODE.equals("full")) {
-                    if (personComparator(p1, p2))
-                        common += 1;
-                }
-                else {
-                    if (surnameComparator(p1, p2))
-                        common += 1;
-                }
-
-        return (double)common / (aList.size() + bList.size() - common);
-    }
-
-    public boolean personComparator(Person p1, Person p2) {
-
-        if(!p1.isAccurate() || !p2.isAccurate())
-            return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD;
-
-        if(ssalgo.score(p1.getSurnameString(),p2.getSurnameString()) > SURNAME_THRESHOLD)
-            if(p1.getNameString().length()<=2 || p2.getNameString().length()<=2)
-                return firstLC(p1.getNameString()).equals(firstLC(p2.getNameString()));
-            else
-                return ssalgo.score(p1.getNameString(), p2.getNameString()) > NAME_THRESHOLD;
-        else
-            return false;
-    }
-
-    public boolean surnameComparator(Person p1, Person p2) {
-
-        if(!p1.isAccurate() || !p2.isAccurate())
-            return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD;
-
-        return ssalgo.score(p1.getSurnameString(), p2.getSurnameString()) > SURNAME_THRESHOLD;
-    }
+
+            for (Person p2 : bList) {
+
+                //both persons are inaccurate
+                if (!p1.isAccurate() && !p2.isAccurate()) {
+                    //compare just normalized fullnames
+                    if (ssalgo.score(normalization(p1.getNormalisedFullname()), normalization(p2.getNormalisedFullname())) > FULLNAME_THRESHOLD) {
+                        common += 1;
+                        break;
+                    }
+                }
+
+                //one person is inaccurate
+                if (p1.isAccurate() ^ p2.isAccurate()) {
+                    //prepare data
+                    String name = p1.isAccurate()? normalization(p1.getNormalisedFirstName()) : normalization(p2.getNormalisedFirstName());
+                    String surname = p1.isAccurate()? normalization(p2.getNormalisedSurname()) : normalization(p2.getNormalisedSurname());
+
+                    String fullname = p1.isAccurate()? normalization(p2.getNormalisedFullname()) : normalization(p1.getNormalisedFullname());
+
+                    if (fullname.contains(surname)) {
+                        if (MODE.equals("full")) {
+                            if (fullname.contains(name)) {
+                                common += 1;
+                                break;
+                            }
+                        }
+                        else { //MODE equals "surname"
+                            common += 1;
+                            break;
+                        }
+                    }
+                }
+
+                //both persons are accurate
+                if (p1.isAccurate() && p2.isAccurate()) {
+
+                    if (compareSurname(p1, p2)) {
+                        if (MODE.equals("full")) {
+                            if(compareFirstname(p1, p2)) {
+                                common += 1;
+                                break;
+                            }
+                        }
+                        else { //MODE equals "surname"
+                            common += 1;
+                            break;
+                        }
+                    }
+
+                }
+
+            }
+
+        //normalization factor to compute the score
+        int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
+
+        return (double)common / normFactor;
+    }
+
+    public boolean compareSurname(Person p1, Person p2) {
+        return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
+    }
+
+    public boolean compareFirstname(Person p1, Person p2) {
+
+        if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
+            if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
+                return true;
+        }
+
+        return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
+    }
+
+    public String normalization(String s) {
+        return normalize(utf8(cleanup(s)));
+    }
+
 }

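A hedged sketch of how the rewritten comparator can be exercised in "surname" mode; it assumes the ComparatorTest context shown below (its createFieldList helper and conf object), while the author lists and the expected value are made-up illustration only.

// Sketch, not part of the commit: author lists are hypothetical, params mirror the "surnames" node above.
Map<String, String> params = new HashMap<>();
params.put("surname_th", "0.75");
params.put("fullname_th", "0.75");
params.put("mode", "surname");
AuthorsMatch authorsMatch = new AuthorsMatch(params);

Field a = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio", "La Bruzzo, Sandro"), "authors");
Field b = createFieldList(Arrays.asList("Manghi, P.", "Bardi, Alessia"), "authors");
double result = authorsMatch.compare(a, b, conf);
// one shared surname and lists of different sizes: normFactor = 3 + 2 - 1 = 4, so the score should come out around 1/4 = 0.25
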
@@ -36,23 +36,23 @@ public class BlockProcessorForTesting {
         this.dedupConf = dedupConf;
     }
 
-    public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context, boolean useTree) {
+    public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context, boolean useTree, boolean noMatch) {
         if (documents.size() > 1) {
             // log.info("reducing key: '" + key + "' records: " + q.size());
-            process(prepare(documents), context, useTree);
+            process(prepare(documents), context, useTree, noMatch);
 
         } else {
             context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
         }
     }
 
-    public void process(final String key, final Iterable<MapDocument> documents, final Reporter context, boolean useTree) {
+    public void process(final String key, final Iterable<MapDocument> documents, final Reporter context, boolean useTree, boolean noMatch) {
 
         final Queue<MapDocument> q = prepare(documents);
 
         if (q.size() > 1) {
             // log.info("reducing key: '" + key + "' records: " + q.size());
-            process(simplifyQueue(q, key, context), context, useTree);
+            process(simplifyQueue(q, key, context), context, useTree, noMatch);
 
         } else {
             context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
@@ -123,7 +123,7 @@ public class BlockProcessorForTesting {
         }
     }
 
-    private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree) {
+    private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree, boolean noMatch) {
 
         while (!queue.isEmpty()) {
 
@@ -155,18 +155,18 @@ public class BlockProcessorForTesting {
 
                 if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
 
-                    if(!compareInstanceType(pivot, curr, dedupConf)){
-                        emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
+                    //draws no match relations (test purpose)
+                    if (noMatch) {
+                        emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
                     }
                     else {
-                        emitOutput(false, idPivot, idCurr, context);
+                        //use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
+                        if(useTree)
+                            emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
+                        else
+                            emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
                     }
 
-//                    if(useTree)
-//                        emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
-//                    else
-//                        emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
-
                 }
             }
         }

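A hedged sketch of how the new noMatch switch can be driven from a test through the Deduper entry point added above; the SparkSession setup, the dedupConf variable and the paths are placeholders, not values taken from this commit.

// Sketch only: dedupConf is a DedupConfig loaded elsewhere; paths are hypothetical test locations.
SparkSession spark = SparkSession.builder().master("local[*]").appName("dedup-test").getOrCreate();
// regular simrels: pairs the decision tree marks as MATCH
Deduper.createSimRels(dedupConf, spark, "/tmp/entities", "/tmp/simrels", true, false);
// "no match" simrels: the testing block processor negates the tree decision, emitting the pairs it would NOT merge
Deduper.createSimRels(dedupConf, spark, "/tmp/entities", "/tmp/simrels_nomatch", true, true);
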
@@ -165,11 +165,6 @@ public class ComparatorTest extends AbstractPaceTest {
         result = jaroWinkler.distance("Victoria Dataverse", "Windsor Dataverse", conf);
         System.out.println("result = " + result);
 
-        final Levenstein levenstein = new Levenstein(params);
-
-        result = levenstein.distance("Victoria", "Windsor", conf);
-        System.out.println("result = " + result);
-
     }
 
     @Test
@@ -182,6 +177,14 @@ public class ComparatorTest extends AbstractPaceTest {
         System.out.println("result = " + result);
     }
 
+    @Test
+    public void levensteinTest() {
+        final Levenstein levenstein = new Levenstein(params);
+
+        double result = levenstein.distance("la bruzzo", "la bruzzo", conf);
+        System.out.println("result = " + result);
+    }
+
     @Test
     public void instanceTypeMatchTest() {
 
@@ -238,6 +241,11 @@ public class ComparatorTest extends AbstractPaceTest {
 
         assertEquals(1.0, result);
 
+        Field e = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio"), "authors");
+        result = authorsMatch.compare(a, e, conf);
+
+        assertEquals(0.25, result);
+
     }
 
     @Test

@@ -1,23 +1,35 @@
 package eu.dnetlib.pace.util;
 
+import eu.dnetlib.pace.model.Person;
 import jdk.nashorn.internal.ir.annotations.Ignore;
 import org.junit.jupiter.api.*;
 
 import java.util.HashMap;
 import java.util.Map;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 public class UtilTest {
 
-    Map<String, String> params;
+    static Map<String, String> params;
 
     @BeforeAll
-    public void setUp(){
-        params = new HashMap<String, String>();
+    public static void setUp(){
+        params = new HashMap<>();
     }
 
     @Test
     @Ignore
     public void paceResolverTest() {
         PaceResolver paceResolver = new PaceResolver();
         paceResolver.getComparator("keywordMatch", params);
     }
 
+    @Test
+    public void personTest() {
+        Person p = new Person("j. f. kennedy", false);
+
+        assertEquals("kennedy", p.getSurnameString());
+        assertEquals("j f", p.getNameString());
+    }
+
 }