diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java index 852ee163d..a1c3c2cc0 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java @@ -1,17 +1,11 @@ package eu.dnetlib.dhp.oa.merge; -import java.io.FileWriter; -import java.io.IOException; import java.text.Normalizer; import java.util.*; -import java.util.function.Function; import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.tuple.MutablePair; -import org.apache.commons.lang3.tuple.Pair; import org.jetbrains.annotations.NotNull; import com.wcohen.ss.JaroWinkler; @@ -161,42 +155,8 @@ public class AuthorMerger { .replaceAll("(\\n)+", " ") .trim(); -// return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining()); } -// -// public static List enrichOrcid2(List baseAuthor, List orcidAuthor) { -// if (baseAuthor == null || baseAuthor.isEmpty()) -// return orcidAuthor; -// -// if (orcidAuthor == null || orcidAuthor.isEmpty()) -// return baseAuthor; -// -// if (baseAuthor.size() == 1 && orcidAuthor.size() > 10) -// return baseAuthor; -// -// -// Map> pubClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> { -// a.addAll(b); -// return a; -// })); -// -// Map> orcidClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> { -// a.addAll(b); -// return a; -// })); -// -// System.out.println(pubClusters.keySet().size()); -// System.out.println(orcidClusters.keySet().size()); -// -// -// -// -// return null; -// -// -// } - static int hammingDist(String str1, String str2) { if (str1.length() != str2.length()) return Math.max(str1.length(), str2.length()); @@ -220,7 +180,14 @@ public class AuthorMerger { return null; } - public static boolean checkSimilarity2(final Author left, final Author right) { + /** + * This method tries to figure out when two author are the same in the contest + * of ORCID enrichment + * @param left Author in the OAF entity + * @param right Author ORCID + * @return based on a heuristic on the names of the authors if they are the same. + */ + public static boolean checkORCIDSimilarity(final Author left, final Author right) { final Person pl = parse(left); final Person pr = parse(right); @@ -267,8 +234,16 @@ public class AuthorMerger { else return false; } + // - public static List enrichOrcid2(List baseAuthor, List orcidAuthor) { + + /** + * Method to enrich ORCID information in one list of authors based on another list + * @param baseAuthor the Author List in the OAF Entity + * @param orcidAuthor The list of ORCID Author intersected + * @return The Author List of the OAF Entity enriched with the orcid Author + */ + public static List enrichOrcid(List baseAuthor, List orcidAuthor) { if (baseAuthor == null || baseAuthor.isEmpty()) return orcidAuthor; @@ -283,7 +258,7 @@ public class AuthorMerger { oAuthor.addAll(orcidAuthor); baseAuthor.forEach(ba -> { - Optional aMatch = oAuthor.stream().filter(oa -> checkSimilarity2(ba, oa)).findFirst(); + Optional aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst(); if (aMatch.isPresent()) { final Author sameAuthor = aMatch.get(); addPid(ba, sameAuthor.getPid()); @@ -293,40 +268,6 @@ public class AuthorMerger { return baseAuthor; } - public static List enrichOrcid(List baseAuthor, List orcidAuthor) { - - if (baseAuthor == null || baseAuthor.isEmpty()) - return orcidAuthor; - - if (orcidAuthor == null || orcidAuthor.isEmpty()) - return baseAuthor; - - if (baseAuthor.size() == 1 && orcidAuthor.size() > 10) - return baseAuthor; - - final Double similarityMatrix[][] = new Double[baseAuthor.size()][orcidAuthor.size()]; - - final List maxColums = new ArrayList<>(); - - for (int i = 0; i < orcidAuthor.size(); i++) - maxColums.add(new SimilarityCellInfo()); - - for (int i = 0; i < baseAuthor.size(); i++) { - for (int j = 0; j < orcidAuthor.size(); j++) { - similarityMatrix[i][j] = sim(baseAuthor.get(i), orcidAuthor.get(j)); - if (maxColums.get(j).maxColumnSimilarity < similarityMatrix[i][j]) - maxColums.get(j).setValues(i, j, similarityMatrix[i][j]); - } - } - maxColums - .stream() - .sorted() - .filter(si -> si.maxColumnSimilarity > 0.85) - .forEach(si -> addPid(baseAuthor.get(si.authorPosition), orcidAuthor.get(si.orcidPosition).getPid())); - return baseAuthor; - - } - private static void addPid(final Author a, final List pids) { if (a.getPid() == null) { diff --git a/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java b/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java index 9eccab5f1..a11d49b1e 100644 --- a/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java @@ -4,13 +4,9 @@ package eu.dnetlib.oa.merge; import static org.junit.jupiter.api.Assertions.*; import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; import java.io.InputStreamReader; -import java.util.Arrays; import java.util.List; import java.util.Objects; -import java.util.stream.Collectors; import org.junit.jupiter.api.Test; import org.junit.platform.commons.util.StringUtils; @@ -67,7 +63,7 @@ public class AuthorMergerTest { long start = System.currentTimeMillis(); // final List enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors); - final List enrichedList = AuthorMerger.enrichOrcid2(publicationAuthors, orcidAuthors); + final List enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors); long enrichedAuthorWithPid = enrichedList .stream() @@ -105,7 +101,7 @@ public class AuthorMergerTest { right.setSurname("Anand"); right.setFullname("Rachna, Anand"); // System.out.println(AuthorMerger.normalize(right.getFullname())); - boolean same = AuthorMerger.checkSimilarity2(left, right); + boolean same = AuthorMerger.checkORCIDSimilarity(left, right); assertTrue(same); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala index 9b85ba4f2..3c9e04a21 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala @@ -89,7 +89,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String] p } case (p: Publication, r: Row) => - p.setAuthor(AuthorMerger.enrichOrcid2(p.getAuthor, AuthorEnricher.toOAFAuthor(r))) + p.setAuthor(AuthorMerger.enrichOrcid(p.getAuthor, AuthorEnricher.toOAFAuthor(r))) p } .write