From 6a6c266dde6f9097c82696c8011677955ac8e479 Mon Sep 17 00:00:00 2001
From: miconis
Date: Tue, 31 Jan 2023 11:53:10 +0100
Subject: [PATCH] implementation of author dedup configuration and lnfi clustering function

---
Reviewer note: the line structure of this patch was restored after its newlines
had been collapsed; indentation and blank context lines are best-effort.
LastNameFirstInitial.java was revised during review, so the post-image blob id
on its "index" line no longer matches the content below.

 .../pace/clustering/LastNameFirstInitial.java | 114 ++++++++++++++++++
 .../java/eu/dnetlib/pace/model/Person.java    |   2 +-
 .../clustering/ClusteringFunctionTest.java    |   9 ++
 .../java/eu/dnetlib/pace/util/UtilTest.java   |   5 +
 4 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java

diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
new file mode 100644
index 000000000..7f86854c2
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
@@ -0,0 +1,114 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.Person;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function registered as "lnfi": derives blocking keys from a person name by
+ * concatenating a one-letter initial with a name term, all lower-cased.
+ */
+@ClusteringClass("lnfi")
+public class LastNameFirstInitial extends AbstractClusteringFunction {
+
+	/** Value of the "aggressive" flag handed to Person when the parameter is not configured. */
+	private static final boolean DEFAULT_AGGRESSIVE = true;
+
+	// NOTE(review): the type arguments of this Map were lost when the patch was mangled;
+	// restore them to match the AbstractClusteringFunction constructor.
+	public LastNameFirstInitial(final Map params) {
+		super(params);
+	}
+
+	/**
+	 * Normalizes every non-empty field value, computes its keys and returns the distinct,
+	 * non-blank keys that remain after the n-gram blacklist filter.
+	 */
+	@Override
+	public Collection<String> apply(Config conf, List<Field> fields) {
+		return fields.stream().filter(f -> !f.isEmpty())
+				.map(Field::stringValue)
+				.map(this::normalize)
+				.map(s -> doApply(conf, s))
+				.map(c -> filterBlacklisted(c, ngramBlacklist))
+				.flatMap(c -> c.stream())
+				.filter(StringUtils::isNotBlank)
+				.collect(Collectors.toCollection(HashSet::new));
+	}
+
+	@Override
+	protected String normalize(final String s) {
+		return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+				// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+				.replaceAll("[^ \\w]+", "")
+				.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+				.replaceAll("(\\p{Punct})+", " ")
+				.replaceAll("(\\d)+", " ")
+				.replaceAll("(\\n)+", " ")
+				.trim();
+	}
+
+	/**
+	 * Builds the keys for one normalized name.
+	 * Accurate names yield first-initial + surname; otherwise the first and last terms of the
+	 * full name are combined in both orders, and a single term is used as is.
+	 */
+	@Override
+	protected Collection<String> doApply(final Config conf, final String s) {
+
+		final List<String> res = Lists.newArrayList();
+
+		final Person p = new Person(s, isAggressive());
+
+		if (p.isAccurate()) {
+			final String lastName = p.getNormalisedSurname().toLowerCase(Locale.ROOT);
+			final String firstName = p.getNormalisedFirstName().toLowerCase(Locale.ROOT);
+
+			res.add(initial(firstName).concat(lastName));
+			return res;
+		}
+
+		// is not accurate, meaning it has no defined name and surname
+		// empty terms (leading or repeated spaces) are dropped so that taking an initial cannot fail
+		final List<String> terms = Arrays.stream(p.getNormalisedFullname().split(" "))
+				.filter(StringUtils::isNotEmpty)
+				.collect(Collectors.toList());
+
+		if (terms.size() == 1) {
+			res.add(terms.get(0).toLowerCase(Locale.ROOT));
+		}
+		else if (terms.size() > 1) {
+			final String first = terms.get(0);
+			final String last = terms.get(terms.size() - 1);
+			res.add(initial(first).concat(last).toLowerCase(Locale.ROOT));
+			res.add(initial(last).concat(first).toLowerCase(Locale.ROOT));
+		}
+
+		return res;
+	}
+
+	/** Reads the "aggressive" parameter, falling back to the default when it is missing. */
+	private boolean isAggressive() {
+		final Object value = getParams().get("aggressive");
+		if (value == null) {
+			return DEFAULT_AGGRESSIVE;
+		}
+		if (value instanceof Boolean) {
+			return (Boolean) value;
+		}
+		if (value instanceof Number) {
+			return ((Number) value).intValue() != 0;
+		}
+		return Boolean.parseBoolean(value.toString());
+	}
+
+	/** Returns the first character of the given term, or an empty string for an empty term. */
+	private static String initial(final String term) {
+		return term.isEmpty() ? "" : term.substring(0, 1);
+	}
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
index ec3340672..543b1bdfe 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@@ -43,7 +43,7 @@ public class Person {
 			// s = s.replaceAll("[\\W&&[^,-]]", "");
 		}
 
-		if (s.contains(",")) {
+		if (s.contains(",")) { // if the name contains a comma, name and surname can be derived easily
 			final String[] arr = s.split(",");
 			if (arr.length == 1) {
 				fullname = splitTerms(arr[0]);
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
index 7a1d389bb..f57daaa32 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@@ -237,4 +237,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 
 	}
 
+	@Test
+	public void testLastNameFirstInitial(){
+
+		final ClusteringFunction cf = new LastNameFirstInitial(params);
+		final String s = "LI Yonghong";
+		System.out.println("s = " + s);
+		System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
+	}
+
 }
\ No newline at end of file
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
index 601831e67..1e6053246 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@@ -30,6 +30,11 @@ public class UtilTest {
 
 		assertEquals("kennedy", p.getSurnameString());
 		assertEquals("j f", p.getNameString());
+
+		p = new Person("Guan-Hua Du", false);
+
+		System.out.println("surname = " + p.getSurnameString());
+		System.out.println("name = " + p.getNameString());
 	}
 
 }