implementation of author dedup configuration and lnfi clustering function

2023-01-31 11:53:10 +01:00 · 2023-01-31 11:53:10 +01:00 · 66472ce408
parent 00466512ea
commit 66472ce408
7 changed files with 291 additions and 18 deletions
--- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json
@ -0,0 +1,179 @@
 {
  "wf": {
    "threshold": "0.99",
    "dedupRun": "001",
    "entityType": "author",
    "subEntityType": "author",
    "subEntityValue": "author",
    "orderField": "fullname",
    "queueMaxSize": "200",
    "groupMaxSize": "100",
    "maxChildren": "100",
    "slidingWindowSize": "50",
    "rootBuilder": [
      "result",
      "resultProject_outcome_isProducedBy",
      "resultResult_publicationDataset_isRelatedTo",
      "resultResult_similarity_isAmongTopNSimilarDocuments",
      "resultResult_similarity_hasAmongTopNSimilarDocuments",
      "resultOrganization_affiliation_isAffiliatedWith",
      "resultResult_part_hasPart",
      "resultResult_part_isPartOf",
      "resultResult_supplement_isSupplementTo",
      "resultResult_supplement_isSupplementedBy",
      "resultResult_version_isVersionOf"
    ],
    "includeChildren": "true",
    "maxIterations": 20,
    "idPath": "$.id"
  },
  "pace": {
    "clustering" : [
      { "name" : "lnfi", "fields" : [ "name" ], "params" : {} }
    ],
    "decisionTree": {
      "start": {
        "fields": [
          {
            "field": "pub_id",
            "comparator": "exactMatch",
            "weight": 1,
            "countIfUndefined": "false",
            "params": {}
          }
        ],
        "threshold":1,
        "aggregation": "AVG",
        "positive": "NO_MATCH",
        "negative": "yearCheck",
        "undefined": "yearCheck"
      },
      "yearCheck": {
        "fields": [
          {
            "field": "year",
            "comparator": "numbersComparator",
            "weight": 1,
            "countIfUndefined": "false",
            "params": {}
          }
        ],
        "threshold": 50,
        "aggregation": "MAX",
        "positive": "NO_MATCH",
        "negative": "surnames",
        "undefined": "surnames",
        "ignoreUndefined": "true"
      },
      "surnames": {
        "fields": [
          {
            "field": "coauthors",
            "comparator": "authorsMatch",
            "weight": 1.0,
            "countIfUndefined": "false",
            "params": {
              "surname_th": 0.75,
              "fullname_th": 0.75,
              "size_th": 20,
              "mode": "surname"
            }
          }
        ],
        "threshold": 0.5,
        "aggregation": "MAX",
        "positive": "MATCH",
        "negative": "cityCheck",
        "undefined": "cityCheck",
        "ignoreUndefined": "true"
      },
      "cityCheck": {
        "fields": [
          {
            "field": "org",
            "comparator": "cityMatch",
            "weight": 1.0,
            "countIfUndefined": "true",
            "params": {
              "windowSize": "4"
            }
          }
        ],
        "threshold": 0.1,
        "aggregation": "AVG",
        "positive": "keywordCheck",
        "negative": "NO_MATCH",
        "undefined": "keywordCheck",
        "ignoreUndefined": "true"
      },
      "keywordCheck": {
        "fields": [
          {
            "field": "org",
            "comparator": "keywordMatch",
            "weight": 1.0,
            "countIfUndefined": "true",
            "params": {
              "windowSize": "4"
            }
          }
        ],
        "threshold": 0.5,
        "aggregation": "AVG",
        "positive": "orgCheck",
        "negative": "NO_MATCH",
        "undefined": "orgCheck",
        "ignoreUndefined": "true"
      },
      "orgCheck": {
        "fields": [
          {
            "field": "org",
            "comparator": "jaroWinklerNormalizedName",
            "weight": 1,
            "countIfUndefined": "true",
            "params": {
              "windowSize": "4"
            }
          }
        ],
        "threshold": 0.7,
        "aggregation": "AVG",
        "positive": "MATCH",
        "negative": "NO_MATCH",
        "undefined": "MATCH",
        "ignoreUndefined": "true"
      }
    },
    "model": [
      {
        "name": "name",
        "type": "String",
        "path": "$.name"
      },
      {
        "name": "coauthors",
        "type": "List",
        "path": "$.coauthors[*].name",
        "size": 200
      },
      {
        "name": "year",
        "type": "String",
        "path": "$.year"
      },
      {
        "name": "pub_id",
        "type": "String",
        "path": "$.pub_id"
      },
      {
        "name": "org",
        "type": "String",
        "path": "$.org"
      }
    ],
    "blacklists": {},
    "synonyms": {}
  }
 }
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json
@ -29,26 +29,24 @@
  },
  "pace": {
    "clustering" : [
-      { "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
+      { "name" : "lnfi", "fields" : [ "name" ], "params" : {} }
      { "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
    ],
    "decisionTree": {
      "start": {
        "fields": [
          {
-            "field": "year",
+            "field": "pub_id",
-            "comparator": "numbersComparator",
+            "comparator": "exactMatch",
            "weight": 1,
            "countIfUndefined": "false",
            "params": {}
          }
        ],
-        "threshold": 50,
+        "threshold":1,
-        "aggregation": "MAX",
+        "aggregation": "AVG",
        "positive": "NO_MATCH",
-        "negative": "surnames",
+        "negative": "yearCheck",
-        "undefined": "surnames",
+        "undefined": "yearCheck"
        "ignoreUndefined": "true"
      },
      "surnames": {
        "fields": [
@ -65,7 +63,7 @@
            }
          }
        ],
-        "threshold": 0.6,
+        "threshold": 0.5,
        "aggregation": "MAX",
        "positive": "MATCH",
        "negative": "NO_MATCH",
@ -75,7 +73,7 @@
    },
    "model": [
      {
-        "name": "fullname",
+        "name": "name",
        "type": "String",
        "path": "$.name"
      },
@ -88,12 +86,17 @@
      {
        "name": "year",
        "type": "String",
-        "path": "$.publication.year"
+        "path": "$.year"
      },
      {
-        "name": "title",
+        "name": "pub_id",
        "type": "String",
-        "path": "$.publication.title"
+        "path": "$.pub_id"
      },
      {
        "name": "org",
        "type": "String",
        "path": "$.org"
      }
    ],
    "blacklists": {},
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
@ -0,0 +1,77 @@
 package eu.dnetlib.pace.clustering;
 import com.google.common.collect.Lists;
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.model.Field;
 import eu.dnetlib.pace.model.Person;
 import org.apache.commons.lang3.StringUtils;
 import java.util.*;
 import java.util.stream.Collectors;
/**
 * Clustering function registered under the name {@code lnfi}.
 *
 * <p>For every non-empty input field it produces keys made of the first initial of the
 * given name followed by the surname, all lower-cased. When the {@link Person} parser
 * cannot tell name and surname apart, both "initial + other token" combinations of the
 * first and the last token are emitted.
 */
@ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction {

    /** Value of the "aggressive" flag when the parameter is not configured. */
    private static final boolean DEFAULT_AGGRESSIVE = true;

    /**
     * @param params clustering parameters, forwarded unchanged to the superclass
     */
    public LastNameFirstInitial(final Map<String, Integer> params) {
        super(params);
    }

    /**
     * Normalizes each non-empty field value, derives its clustering keys, drops the
     * blacklisted and blank ones and returns the distinct remaining keys.
     *
     * @param conf   configuration handed over to {@link #doApply(Config, String)}
     * @param fields fields to derive the keys from
     * @return the set of distinct, non-blank keys
     */
    @Override
    public Collection<String> apply(Config conf, List<Field> fields) {
        return fields.stream()
                .filter(f -> !f.isEmpty())
                .map(Field::stringValue)
                .map(this::normalize)
                .map(s -> doApply(conf, s))
                .map(c -> filterBlacklisted(c, ngramBlacklist))
                .flatMap(c -> c.stream())
                .filter(StringUtils::isNotBlank)
                .collect(Collectors.toCollection(HashSet::new));
    }

    /**
     * Reduces the input to letters, underscores-turned-spaces and spaces.
     *
     * <p>The first replacement already removes every character that is neither a space
     * nor a {@code \w} character, which includes commas, diacritical marks and line
     * breaks; of the later steps only the underscore and digit replacements can still
     * match. The chain is kept as is to stay aligned with the other normalizers.
     *
     * @param s raw field value
     * @return the trimmed, normalized value
     */
    @Override
    protected String normalize(final String s) {
        return fixAliases(transliterate(nfd(unicodeNormalization(s))))
                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
                .replaceAll("[^ \\w]+", "")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim();
    }

    /**
     * Builds the clustering keys for a single (already normalized) person name.
     *
     * @param conf unused by this implementation
     * @param s    the person name
     * @return one key when name and surname are recognized, one key for a single-token
     *         name, two keys otherwise; empty when the name holds no token at all
     */
    @Override
    protected Collection<String> doApply(final Config conf, final String s) {
        final List<String> res = Lists.newArrayList();
        final Person p = new Person(s, isAggressive());

        if (p.isAccurate()) {
            res.add(initialPlus(p.getNormalisedFirstName(), p.getNormalisedSurname()));
            return res;
        }

        // not accurate, meaning it has no defined name and surname.
        // Blank tokens (leading or repeated spaces) are dropped: taking their first
        // character would throw StringIndexOutOfBoundsException.
        final List<String> tokens = Arrays.stream(p.getNormalisedFullname().split(" "))
                .filter(StringUtils::isNotBlank)
                .collect(Collectors.toList());

        if (tokens.size() == 1) {
            res.add(tokens.get(0).toLowerCase(Locale.ROOT));
        } else if (tokens.size() > 1) {
            // either end of the name may be the surname, so emit both combinations
            final String first = tokens.get(0);
            final String last = tokens.get(tokens.size() - 1);
            res.add(initialPlus(first, last));
            res.add(initialPlus(last, first));
        }
        return res;
    }

    /**
     * Reads the "aggressive" parameter without assuming its runtime type.
     *
     * <p>The constructor receives integer-valued parameters, so casting the value
     * straight to {@code Boolean} fails with a ClassCastException as soon as the
     * parameter is configured. Numbers are interpreted as "non-zero means true"
     * (NOTE(review): confirm this is the intended encoding of the flag).
     */
    private boolean isAggressive() {
        final Map<String, ?> configured = getParams();
        final Object raw = configured == null ? null : configured.get("aggressive");
        if (raw instanceof Boolean) {
            return (Boolean) raw;
        }
        if (raw instanceof Number) {
            return ((Number) raw).intValue() != 0;
        }
        return DEFAULT_AGGRESSIVE;
    }

    /**
     * Concatenates the first character of {@code initialSource} with {@code rest},
     * lower-casing both with a fixed locale so that keys do not depend on the JVM
     * default locale. An empty {@code initialSource} contributes nothing.
     */
    private static String initialPlus(final String initialSource, final String rest) {
        final String head = initialSource.toLowerCase(Locale.ROOT);
        final String tail = rest.toLowerCase(Locale.ROOT);
        return head.isEmpty() ? tail : head.substring(0, 1).concat(tail);
    }
}
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@ -43,7 +43,7 @@ public class Person {
 			// s = s.replaceAll("[\\W&&[^,-]]", "");
 		}
-		if (s.contains(",")) {
+		if (s.contains(",")) {	// if the name contains a comma, the name and the surname are easy to derive
 			final String[] arr = s.split(",");
 			if (arr.length == 1) {
 				fullname = splitTerms(arr[0]);
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@ -237,4 +237,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 	}
 	@Test
 	public void testLastNameFirstInitial(){
 		// Smoke test: prints the input and the keys computed for it, asserts nothing.
 		final String input = "LI Yonghong";
 		final ClusteringFunction lnfi = new LastNameFirstInitial(params);

 		System.out.println("s = " + input);
 		System.out.println(lnfi.apply(conf, Lists.newArrayList(title(input))));
 	}
 }
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@ -30,6 +30,11 @@ public class UtilTest {
        assertEquals("kennedy", p.getSurnameString());
        assertEquals("j f", p.getNameString());
        p = new Person("Guan-Hua Du", false);
        System.out.println("surname = " + p.getSurnameString());
        System.out.println("name = " + p.getNameString());
    }
 }