From eec418cd26d8724887f6f36cf7a98af6a8711c0a Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 8 Oct 2020 10:33:55 +0200 Subject: [PATCH] moved AuthoreMerger into dhp-common --- dhp-common/pom.xml | 11 ++ .../eu/dnetlib/dhp/oa/merge/AuthorMerger.java | 168 +++++++++++++++++ .../eu/dnetlib/dhp/oa/dedup/AuthorMerger.java | 170 ------------------ .../dhp/oa/dedup/DedupRecordFactory.java | 3 +- .../dhp/oa/dedup/EntityMergerTest.java | 1 + dhp-workflows/dhp-graph-mapper/pom.xml | 7 - .../eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala | 2 +- 7 files changed, 183 insertions(+), 179 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java delete mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 1dc3208b5..b295bc1f1 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -92,6 +92,17 @@ com.squareup.okhttp3 okhttp + + + eu.dnetlib + dnet-pace-core + + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java new file mode 100644 index 000000000..bc86a0245 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java @@ -0,0 +1,168 @@ +package eu.dnetlib.dhp.oa.merge; +import java.text.Normalizer; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import com.wcohen.ss.JaroWinkler; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.model.Person; +import scala.Tuple2; + +public class AuthorMerger { + + private static final Double THRESHOLD = 0.95; + + public static List merge(List> authors) { + + authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2))); + + List author = new ArrayList<>(); + + for (List a : authors) { + author = mergeAuthor(author, a); + } + + return author; + + } + + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); + + if (pa == pb) { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } else { + base = pa > pb ? a : b; + enrich = pa > pb ? b : a; + } + enrichPidFromList(base, enrich); + return base; + } + + private static void enrichPidFromList(List base, List enrich) { + if (base == null || enrich == null) + return; + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .map(p -> new Tuple2<>(pidToComparableString(p), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); + + pidToEnrich + .forEach( + a -> { + Optional> simAuthor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + + if (simAuthor.isPresent()) { + double th = THRESHOLD; + // increase the threshold if the surname is too short + if (simAuthor.get()._2().getSurname() != null + && simAuthor.get()._2().getSurname().length() <= 3) + th = 0.99; + + if (simAuthor.get()._1() > th) { + Author r = simAuthor.get()._2(); + if (r.getPid() == null) { + r.setPid(new ArrayList<>()); + } + r.getPid().add(a._1()); + } + } + }); + } + + public static String pidToComparableString(StructuredProperty pid) { + return (pid.getQualifier() != null + ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" + : "") + + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + } + + public static int countAuthorsPids(List authors) { + if (authors == null) + return 0; + + return (int) authors.stream().filter(AuthorMerger::hasPid).count(); + } + + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } + + private static Double sim(Author a, Author b) { + + final Person pa = parse(a); + final Person pb = parse(b); + + // if both are accurate (e.g. they have name and surname) + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5 + + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5; + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } + + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } + + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } + + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java deleted file mode 100644 index ee5fd5165..000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java +++ /dev/null @@ -1,170 +0,0 @@ - -package eu.dnetlib.dhp.oa.dedup; - -import java.text.Normalizer; -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import com.wcohen.ss.JaroWinkler; - -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.pace.model.Person; -import scala.Tuple2; - -public class AuthorMerger { - - private static final Double THRESHOLD = 0.95; - - public static List merge(List> authors) { - - authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2))); - - List author = new ArrayList<>(); - - for (List a : authors) { - author = mergeAuthor(author, a); - } - - return author; - - } - - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); - - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } - - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) - return; - final Map basePidAuthorMap = base - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .map(p -> new Tuple2<>(pidToComparableString(p), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); - - pidToEnrich - .forEach( - a -> { - Optional> simAuthor = base - .stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - - if (simAuthor.isPresent()) { - double th = THRESHOLD; - // increase the threshold if the surname is too short - if (simAuthor.get()._2().getSurname() != null - && simAuthor.get()._2().getSurname().length() <= 3) - th = 0.99; - - if (simAuthor.get()._1() > th) { - Author r = simAuthor.get()._2(); - if (r.getPid() == null) { - r.setPid(new ArrayList<>()); - } - r.getPid().add(a._1()); - } - } - }); - } - - public static String pidToComparableString(StructuredProperty pid) { - return (pid.getQualifier() != null - ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" - : "") - + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); - } - - public static int countAuthorsPids(List authors) { - if (authors == null) - return 0; - - return (int) authors.stream().filter(AuthorMerger::hasPid).count(); - } - - private static int authorsSize(List authors) { - if (authors == null) - return 0; - return authors.size(); - } - - private static Double sim(Author a, Author b) { - - final Person pa = parse(a); - final Person pb = parse(b); - - // if both are accurate (e.g. they have name and surname) - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5 - + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5; - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } - - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) - return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } - - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } - - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } - - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } - -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 8028d5a94..6a030f376 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,11 +1,12 @@ package eu.dnetlib.dhp.oa.dedup; -import java.io.Serializable; + import java.util.Collection; import java.util.Iterator; import java.util.List; +import eu.dnetlib.dhp.oa.merge.AuthorMerger; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.sql.Dataset; diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index 4fbd7c223..e00f6ac2a 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -10,6 +10,7 @@ import java.io.Serializable; import java.nio.file.Paths; import java.util.*; +import eu.dnetlib.dhp.oa.merge.AuthorMerger; import org.codehaus.jackson.map.ObjectMapper; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 38c5c8af7..3e1d84c01 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -83,13 +83,6 @@ dhp-schemas ${project.version} - - - eu.dnetlib.dhp - dhp-dedup-openaire - ${project.version} - - com.jayway.jsonpath json-path diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala index 90d665e0c..ee2dbadfd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala @@ -1,5 +1,5 @@ package eu.dnetlib.dhp.sx.ebi -import eu.dnetlib.dhp.oa.dedup.AuthorMerger +import eu.dnetlib.dhp.oa.merge.AuthorMerger import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown} import org.apache.spark.sql.{Encoder, Encoders}