implementation of author dedup configuration and lnfi clustering function

2023-01-31 11:53:10 +01:00 · 2023-01-31 11:53:10 +01:00 · 66472ce408
parent 00466512ea
commit 66472ce408
7 changed files with 291 additions and 18 deletions
--- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json
@ -0,0 +1,179 @@
+{
+  "wf": {
+    "threshold": "0.99",
+    "dedupRun": "001",
+    "entityType": "author",
+    "subEntityType": "author",
+    "subEntityValue": "author",
+    "orderField": "fullname",
+    "queueMaxSize": "200",
+    "groupMaxSize": "100",
+    "maxChildren": "100",
+    "slidingWindowSize": "50",
+    "rootBuilder": [
+      "result",
+      "resultProject_outcome_isProducedBy",
+      "resultResult_publicationDataset_isRelatedTo",
+      "resultResult_similarity_isAmongTopNSimilarDocuments",
+      "resultResult_similarity_hasAmongTopNSimilarDocuments",
+      "resultOrganization_affiliation_isAffiliatedWith",
+      "resultResult_part_hasPart",
+      "resultResult_part_isPartOf",
+      "resultResult_supplement_isSupplementTo",
+      "resultResult_supplement_isSupplementedBy",
+      "resultResult_version_isVersionOf"
+    ],
+    "includeChildren": "true",
+    "maxIterations": 20,
+    "idPath": "$.id"
+  },
+  "pace": {
+    "clustering" : [
+      { "name" : "lnfi", "fields" : [ "name" ], "params" : {} }
+    ],
+    "decisionTree": {
+      "start": {
+        "fields": [
+          {
+            "field": "pub_id",
+            "comparator": "exactMatch",
+            "weight": 1,
+            "countIfUndefined": "false",
+            "params": {}
+          }
+        ],
+        "threshold":1,
+        "aggregation": "AVG",
+        "positive": "NO_MATCH",
+        "negative": "yearCheck",
+        "undefined": "yearCheck"
+      },
+      "yearCheck": {
+        "fields": [
+          {
+            "field": "year",
+            "comparator": "numbersComparator",
+            "weight": 1,
+            "countIfUndefined": "false",
+            "params": {}
+          }
+        ],
+        "threshold": 50,
+        "aggregation": "MAX",
+        "positive": "NO_MATCH",
+        "negative": "surnames",
+        "undefined": "surnames",
+        "ignoreUndefined": "true"
+      },
+      "surnames": {
+        "fields": [
+          {
+            "field": "coauthors",
+            "comparator": "authorsMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {
+              "surname_th": 0.75,
+              "fullname_th": 0.75,
+              "size_th": 20,
+              "mode": "surname"
+            }
+          }
+        ],
+        "threshold": 0.5,
+        "aggregation": "MAX",
+        "positive": "MATCH",
+        "negative": "cityCheck",
+        "undefined": "cityCheck",
+        "ignoreUndefined": "true"
+      },
+      "cityCheck": {
+        "fields": [
+          {
+            "field": "org",
+            "comparator": "cityMatch",
+            "weight": 1.0,
+            "countIfUndefined": "true",
+            "params": {
+              "windowSize": "4"
+            }
+          }
+        ],
+        "threshold": 0.1,
+        "aggregation": "AVG",
+        "positive": "keywordCheck",
+        "negative": "NO_MATCH",
+        "undefined": "keywordCheck",
+        "ignoreUndefined": "true"
+      },
+      "keywordCheck": {
+        "fields": [
+          {
+            "field": "org",
+            "comparator": "keywordMatch",
+            "weight": 1.0,
+            "countIfUndefined": "true",
+            "params": {
+              "windowSize": "4"
+            }
+          }
+        ],
+        "threshold": 0.5,
+        "aggregation": "AVG",
+        "positive": "orgCheck",
+        "negative": "NO_MATCH",
+        "undefined": "orgCheck",
+        "ignoreUndefined": "true"
+      },
+      "orgCheck": {
+        "fields": [
+          {
+            "field": "org",
+            "comparator": "jaroWinklerNormalizedName",
+            "weight": 1,
+            "countIfUndefined": "true",
+            "params": {
+              "windowSize": "4"
+            }
+          }
+        ],
+        "threshold": 0.7,
+        "aggregation": "AVG",
+        "positive": "MATCH",
+        "negative": "NO_MATCH",
+        "undefined": "MATCH",
+        "ignoreUndefined": "true"
+      }
+    },
+    "model": [
+      {
+        "name": "name",
+        "type": "String",
+        "path": "$.name"
+      },
+      {
+        "name": "coauthors",
+        "type": "List",
+        "path": "$.coauthors[*].name",
+        "size": 200
+      },
+      {
+        "name": "year",
+        "type": "String",
+        "path": "$.year"
+      },
+      {
+        "name": "pub_id",
+        "type": "String",
+        "path": "$.pub_id"
+      },
+      {
+        "name": "org",
+        "type": "String",
+        "path": "$.org"
+      }
+    ],
+    "blacklists": {},
+    "synonyms": {}
+  }
+}
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json
@ -29,26 +29,24 @@
  },
  "pace": {
    "clustering" : [
-      { "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
-      { "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
+      { "name" : "lnfi", "fields" : [ "name" ], "params" : {} }
    ],
    "decisionTree": {
      "start": {
        "fields": [
          {
-            "field": "year",
-            "comparator": "numbersComparator",
+            "field": "pub_id",
+            "comparator": "exactMatch",
            "weight": 1,
            "countIfUndefined": "false",
            "params": {}
          }
        ],
-        "threshold": 50,
-        "aggregation": "MAX",
+        "threshold":1,
+        "aggregation": "AVG",
        "positive": "NO_MATCH",
-        "negative": "surnames",
-        "undefined": "surnames",
-        "ignoreUndefined": "true"
+        "negative": "yearCheck",
+        "undefined": "yearCheck"
      },
      "surnames": {
        "fields": [
@ -65,7 +63,7 @@
            }
          }
        ],
-        "threshold": 0.6,
+        "threshold": 0.5,
        "aggregation": "MAX",
        "positive": "MATCH",
        "negative": "NO_MATCH",
@ -75,7 +73,7 @@
    },
    "model": [
      {
-        "name": "fullname",
+        "name": "name",
        "type": "String",
        "path": "$.name"
      },
@ -88,12 +86,17 @@
      {
        "name": "year",
        "type": "String",
-        "path": "$.publication.year"
+        "path": "$.year"
      },
      {
-        "name": "title",
+        "name": "pub_id",
        "type": "String",
-        "path": "$.publication.title"
+        "path": "$.pub_id"
+      },
+      {
+        "name": "org",
+        "type": "String",
+        "path": "$.org"
      }
    ],
    "blacklists": {},
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
@ -0,0 +1,77 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.Person;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function registered as {@code lnfi} that builds keys made of the first initial followed by the last name.
+ *
+ * <p>Each non-empty field value is normalised, parsed as a {@link Person} and turned into one or two lower-case keys:
+ * <ul>
+ *   <li>when the person is "accurate" (name and surname were both identified): {@code firstInitial + surname};</li>
+ *   <li>otherwise, from the whitespace-separated tokens of the normalised full name: the token itself when there is
+ *       only one, or both {@code initial(first) + last} and {@code initial(last) + first} when there are more, because
+ *       the order of name and surname is unknown.</li>
+ * </ul>
+ */
+@ClusteringClass("lnfi")
+public class LastNameFirstInitial extends AbstractClusteringFunction {
+
+    /** Value passed to {@link Person} when the {@code aggressive} parameter is not configured. */
+    private static final boolean DEFAULT_AGGRESSIVE = true;
+
+    public LastNameFirstInitial(final Map<String, Integer> params) {
+        super(params);
+    }
+
+    /**
+     * Computes the distinct, non-blank clustering keys for all the non-empty fields.
+     *
+     * @param conf   the dedup configuration, forwarded to {@link #doApply(Config, String)}
+     * @param fields the fields to extract the keys from
+     * @return the set of keys remaining after blacklist filtering
+     */
+    @Override
+    public Collection<String> apply(Config conf, List<Field> fields) {
+        return fields.stream()
+                .filter(f -> !f.isEmpty())
+                .map(Field::stringValue)
+                .map(this::normalize)
+                .map(s -> doApply(conf, s))
+                .map(c -> filterBlacklisted(c, ngramBlacklist))
+                .flatMap(Collection::stream)
+                .filter(StringUtils::isNotBlank)
+                .collect(Collectors.toCollection(HashSet::new));
+    }
+
+    /**
+     * Normalises a raw name without lower-casing it.
+     *
+     * <p>NOTE(review): the first regex drops every character that is neither a space nor a {@code \w} character, so
+     * commas and hyphens never reach {@link Person}; of the following replacements only the underscore and digit
+     * ones can still match. Presumably intended - confirm before relying on comma-based name parsing here.
+     *
+     * @param s the raw field value
+     * @return the normalised, trimmed string
+     */
+    @Override
+    protected String normalize(final String s) {
+        return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+                .replaceAll("[^ \\w]+", "")
+                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+                .replaceAll("(\\p{Punct})+", " ")
+                .replaceAll("(\\d)+", " ")
+                .replaceAll("(\\n)+", " ")
+                .trim();
+    }
+
+    /**
+     * Builds the clustering keys for a single normalised name.
+     *
+     * @param conf the dedup configuration (not used by this function)
+     * @param s    the normalised name
+     * @return zero, one or two lower-case keys; never {@code null}
+     */
+    @Override
+    protected Collection<String> doApply(final Config conf, final String s) {
+
+        final List<String> res = Lists.newArrayList();
+
+        final Person p = new Person(s, isAggressive());
+
+        if (p.isAccurate()) {
+            final String firstName = p.getNormalisedFirstName();
+            // an empty first name cannot provide an initial: fall back to the full name tokens below
+            if (!firstName.isEmpty()) {
+                res.add(firstName.substring(0, 1).concat(p.getNormalisedSurname()).toLowerCase(Locale.ROOT));
+                return res;
+            }
+        }
+
+        // not accurate, meaning it has no defined name and surname: work on the tokens of the full name.
+        // Blank tokens (leading or repeated spaces) are discarded so that substring(0, 1) cannot fail.
+        final List<String> tokens = Arrays.stream(p.getNormalisedFullname().split("\\s+"))
+                .filter(StringUtils::isNotBlank)
+                .collect(Collectors.toList());
+
+        if (tokens.size() == 1) {
+            res.add(tokens.get(0).toLowerCase(Locale.ROOT));
+        }
+        else if (tokens.size() > 1) {
+            // name/surname order is unknown: emit both combinations of the first and the last token
+            final String first = tokens.get(0);
+            final String last = tokens.get(tokens.size() - 1);
+            res.add(first.substring(0, 1).concat(last).toLowerCase(Locale.ROOT));
+            res.add(last.substring(0, 1).concat(first).toLowerCase(Locale.ROOT));
+        }
+
+        return res;
+    }
+
+    /**
+     * Reads the optional {@code aggressive} parameter without assuming its runtime type: a boolean is used as is,
+     * a number is true when non-zero, anything else is parsed as a boolean string.
+     *
+     * @return the configured value, or {@link #DEFAULT_AGGRESSIVE} when the parameter is missing or null
+     */
+    private boolean isAggressive() {
+        final Object configured = getParams().get("aggressive");
+        if (configured == null) {
+            return DEFAULT_AGGRESSIVE;
+        }
+        if (configured instanceof Boolean) {
+            return (Boolean) configured;
+        }
+        if (configured instanceof Number) {
+            return ((Number) configured).intValue() != 0;
+        }
+        return Boolean.parseBoolean(configured.toString());
+    }
+}
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@ -43,7 +43,7 @@ public class Person {
 			// s = s.replaceAll("[\\W&&[^,-]]", "");
 		}

-		if (s.contains(",")) {
+		if (s.contains(",")) {	// if the name contains a comma, the name and the surname can be easily derived
 			final String[] arr = s.split(",");
 			if (arr.length == 1) {
 				fullname = splitTerms(arr[0]);
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@ -237,4 +237,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {

 	}

+	@Test
+	public void testLastNameFirstInitial(){
+		// smoke test: prints the input name and the keys produced by the lnfi clustering function
+		final String fullName = "LI Yonghong";
+		final ClusteringFunction lnfi = new LastNameFirstInitial(params);
+		System.out.println("s = " + fullName);
+		System.out.println(lnfi.apply(conf, Lists.newArrayList(title(fullName))));
+	}
+
 }
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@ -30,6 +30,11 @@ public class UtilTest {

        assertEquals("kennedy", p.getSurnameString());
        assertEquals("j f", p.getNameString());
+
+        p = new Person("Guan-Hua Du", false);
+
+        System.out.println("surname = " + p.getSurnameString());
+        System.out.println("name = " + p.getNameString());
    }

 }