added comment

2023-11-28 09:00:48 +01:00 · 2023-11-28 09:00:48 +01:00 · 59111713fa
parent 6f4d0c05ea
commit 59111713fa
3 changed files with 21 additions and 84 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -1,17 +1,11 @@

 package eu.dnetlib.dhp.oa.merge;

-import java.io.FileWriter;
-import java.io.IOException;
 import java.text.Normalizer;
 import java.util.*;
-import java.util.function.Function;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;

 import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.lang3.tuple.MutablePair;
-import org.apache.commons.lang3.tuple.Pair;
 import org.jetbrains.annotations.NotNull;

 import com.wcohen.ss.JaroWinkler;
@ -161,42 +155,8 @@ public class AuthorMerger {
 			.replaceAll("(\\n)+", " ")

 			.trim();
-//        return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
 	}

-//
-//    public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
-//        if (baseAuthor == null || baseAuthor.isEmpty())
-//            return orcidAuthor;
-//
-//        if (orcidAuthor == null || orcidAuthor.isEmpty())
-//            return baseAuthor;
-//
-//        if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
-//            return baseAuthor;
-//
-//
-//        Map<String, List<Author>> pubClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
-//            a.addAll(b);
-//            return a;
-//        }));
-//
-//        Map<String, List<Author>> orcidClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
-//            a.addAll(b);
-//            return a;
-//        }));
-//
-//        System.out.println(pubClusters.keySet().size());
-//        System.out.println(orcidClusters.keySet().size());
-//
-//
-//
-//
-//       return null;
-//
-//
-//    }
-
 	static int hammingDist(String str1, String str2) {
 		if (str1.length() != str2.length())
 			return Math.max(str1.length(), str2.length());
@ -220,7 +180,14 @@ public class AuthorMerger {
 		return null;
 	}

-	public static boolean checkSimilarity2(final Author left, final Author right) {
+	/**
+	 * This method tries to figure out whether two authors are the same in the context
+	 * of ORCID enrichment
+	 * @param left Author in the OAF entity
+	 * @param right Author ORCID
+	 * @return {@code true} if a heuristic on the names of the authors considers them the same, {@code false} otherwise
+	 */
+	public static boolean checkORCIDSimilarity(final Author left, final Author right) {
 		final Person pl = parse(left);
 		final Person pr = parse(right);

@ -267,8 +234,16 @@ public class AuthorMerger {
 		else
 			return false;
 	}
+	//

-	public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
+
+	/**
+	 * Method to enrich ORCID information in one list of authors based on another list
+	 * @param baseAuthor the Author List in the OAF Entity
+	 * @param orcidAuthor the list of intersected ORCID Authors
+	 * @return the Author List of the OAF Entity enriched with the ORCID Authors
+	 */
+	public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {

 		if (baseAuthor == null || baseAuthor.isEmpty())
 			return orcidAuthor;
@ -283,7 +258,7 @@ public class AuthorMerger {
 		oAuthor.addAll(orcidAuthor);

 		baseAuthor.forEach(ba -> {
-			Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkSimilarity2(ba, oa)).findFirst();
+			Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
 			if (aMatch.isPresent()) {
 				final Author sameAuthor = aMatch.get();
 				addPid(ba, sameAuthor.getPid());
@ -293,40 +268,6 @@ public class AuthorMerger {
 		return baseAuthor;
 	}

-	public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
-
-		if (baseAuthor == null || baseAuthor.isEmpty())
-			return orcidAuthor;
-
-		if (orcidAuthor == null || orcidAuthor.isEmpty())
-			return baseAuthor;
-
-		if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
-			return baseAuthor;
-
-		final Double similarityMatrix[][] = new Double[baseAuthor.size()][orcidAuthor.size()];
-
-		final List<SimilarityCellInfo> maxColums = new ArrayList<>();
-
-		for (int i = 0; i < orcidAuthor.size(); i++)
-			maxColums.add(new SimilarityCellInfo());
-
-		for (int i = 0; i < baseAuthor.size(); i++) {
-			for (int j = 0; j < orcidAuthor.size(); j++) {
-				similarityMatrix[i][j] = sim(baseAuthor.get(i), orcidAuthor.get(j));
-				if (maxColums.get(j).maxColumnSimilarity < similarityMatrix[i][j])
-					maxColums.get(j).setValues(i, j, similarityMatrix[i][j]);
-			}
-		}
-		maxColums
-			.stream()
-			.sorted()
-			.filter(si -> si.maxColumnSimilarity > 0.85)
-			.forEach(si -> addPid(baseAuthor.get(si.authorPosition), orcidAuthor.get(si.orcidPosition).getPid()));
-		return baseAuthor;
-
-	}
-
 	private static void addPid(final Author a, final List<StructuredProperty> pids) {

 		if (a.getPid() == null) {
--- a/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
@ -4,13 +4,9 @@ package eu.dnetlib.oa.merge;
 import static org.junit.jupiter.api.Assertions.*;

 import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
 import java.io.InputStreamReader;
-import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;
-import java.util.stream.Collectors;

 import org.junit.jupiter.api.Test;
 import org.junit.platform.commons.util.StringUtils;
@ -67,7 +63,7 @@ public class AuthorMergerTest {
 				long start = System.currentTimeMillis();

 //                final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
-				final List<Author> enrichedList = AuthorMerger.enrichOrcid2(publicationAuthors, orcidAuthors);
+				final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);

 				long enrichedAuthorWithPid = enrichedList
 					.stream()
@ -105,7 +101,7 @@ public class AuthorMergerTest {
 		right.setSurname("Anand");
 		right.setFullname("Rachna, Anand");
 //        System.out.println(AuthorMerger.normalize(right.getFullname()));
-		boolean same = AuthorMerger.checkSimilarity2(left, right);
+		boolean same = AuthorMerger.checkORCIDSimilarity(left, right);

 		assertTrue(same);

--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@ -89,7 +89,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
          p
        }
        case (p: Publication, r: Row) =>
-          p.setAuthor(AuthorMerger.enrichOrcid2(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
+          p.setAuthor(AuthorMerger.enrichOrcid(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
          p
      }
      .write