From 59111713fac09a4508899f8102946bb5f04c7910 Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo <sandro.labruzzo@gmail.com>
Date: Tue, 28 Nov 2023 09:00:48 +0100
Subject: [PATCH] added comment

---
 .../eu/dnetlib/dhp/oa/merge/AuthorMerger.java | 95 ++++---------------
 .../eu/dnetlib/oa/merge/AuthorMergerTest.java |  8 +-
 .../SparkEnrichGraphWithOrcidAuthors.scala    |  2 +-
 3 files changed, 21 insertions(+), 84 deletions(-)
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
index 852ee163d..a1c3c2cc0 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@@ -1,17 +1,11 @@
 
 package eu.dnetlib.dhp.oa.merge;
 
-import java.io.FileWriter;
-import java.io.IOException;
 import java.text.Normalizer;
 import java.util.*;
-import java.util.function.Function;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
 
 import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.lang3.tuple.MutablePair;
-import org.apache.commons.lang3.tuple.Pair;
 import org.jetbrains.annotations.NotNull;
 
 import com.wcohen.ss.JaroWinkler;
@@ -161,42 +155,8 @@ public class AuthorMerger {
 			.replaceAll("(\\n)+", " ")
 
 			.trim();
-//        return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
 	}
 
-//
-//    public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
-//        if (baseAuthor == null || baseAuthor.isEmpty())
-//            return orcidAuthor;
-//
-//        if (orcidAuthor == null || orcidAuthor.isEmpty())
-//            return baseAuthor;
-//
-//        if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
-//            return baseAuthor;
-//
-//
-//        Map<String, List<Author>> pubClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
-//            a.addAll(b);
-//            return a;
-//        }));
-//
-//        Map<String, List<Author>> orcidClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
-//            a.addAll(b);
-//            return a;
-//        }));
-//
-//        System.out.println(pubClusters.keySet().size());
-//        System.out.println(orcidClusters.keySet().size());
-//
-//
-//
-//
-//       return null;
-//
-//
-//    }
-
 	static int hammingDist(String str1, String str2) {
 		if (str1.length() != str2.length())
 			return Math.max(str1.length(), str2.length());
@@ -220,7 +180,14 @@ public class AuthorMerger {
 		return null;
 	}
 
-	public static boolean checkSimilarity2(final Author left, final Author right) {
+	/**
+	 * Determines whether two authors refer to the same person in the context
+	 * of ORCID enrichment.
+	 * @param left the author from the OAF entity
+	 * @param right the author from ORCID
+	 * @return true if a heuristic on the authors' names considers them the same person, false otherwise
+	 */
+	public static boolean checkORCIDSimilarity(final Author left, final Author right) {
 		final Person pl = parse(left);
 		final Person pr = parse(right);
 
@@ -267,8 +234,16 @@ public class AuthorMerger {
 		else
 			return false;
 	}
+	//
 
-	public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
+
+	/**
+	 * Enriches one list of authors with ORCID information taken from another list.
+	 * @param baseAuthor the author list of the OAF entity
+	 * @param orcidAuthor the list of intersected ORCID authors
+	 * @return the OAF entity author list enriched with the pids of matching ORCID authors, or orcidAuthor when baseAuthor is null or empty
+	 */
+	public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
 
 		if (baseAuthor == null || baseAuthor.isEmpty())
 			return orcidAuthor;
@@ -283,7 +258,7 @@ public class AuthorMerger {
 		oAuthor.addAll(orcidAuthor);
 
 		baseAuthor.forEach(ba -> {
-			Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkSimilarity2(ba, oa)).findFirst();
+			Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
 			if (aMatch.isPresent()) {
 				final Author sameAuthor = aMatch.get();
 				addPid(ba, sameAuthor.getPid());
@@ -293,40 +268,6 @@ public class AuthorMerger {
 		return baseAuthor;
 	}
 
-	public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
-
-		if (baseAuthor == null || baseAuthor.isEmpty())
-			return orcidAuthor;
-
-		if (orcidAuthor == null || orcidAuthor.isEmpty())
-			return baseAuthor;
-
-		if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
-			return baseAuthor;
-
-		final Double similarityMatrix[][] = new Double[baseAuthor.size()][orcidAuthor.size()];
-
-		final List<SimilarityCellInfo> maxColums = new ArrayList<>();
-
-		for (int i = 0; i < orcidAuthor.size(); i++)
-			maxColums.add(new SimilarityCellInfo());
-
-		for (int i = 0; i < baseAuthor.size(); i++) {
-			for (int j = 0; j < orcidAuthor.size(); j++) {
-				similarityMatrix[i][j] = sim(baseAuthor.get(i), orcidAuthor.get(j));
-				if (maxColums.get(j).maxColumnSimilarity < similarityMatrix[i][j])
-					maxColums.get(j).setValues(i, j, similarityMatrix[i][j]);
-			}
-		}
-		maxColums
-			.stream()
-			.sorted()
-			.filter(si -> si.maxColumnSimilarity > 0.85)
-			.forEach(si -> addPid(baseAuthor.get(si.authorPosition), orcidAuthor.get(si.orcidPosition).getPid()));
-		return baseAuthor;
-
-	}
-
 	private static void addPid(final Author a, final List<StructuredProperty> pids) {
 
 		if (a.getPid() == null) {
diff --git a/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java b/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
index 9eccab5f1..a11d49b1e 100644
--- a/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
@@ -4,13 +4,9 @@ package eu.dnetlib.oa.merge;
 import static org.junit.jupiter.api.Assertions.*;
 
 import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
 import java.io.InputStreamReader;
-import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;
-import java.util.stream.Collectors;
 
 import org.junit.jupiter.api.Test;
 import org.junit.platform.commons.util.StringUtils;
@@ -67,7 +63,7 @@ public class AuthorMergerTest {
 				long start = System.currentTimeMillis();
 
 //                final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
-				final List<Author> enrichedList = AuthorMerger.enrichOrcid2(publicationAuthors, orcidAuthors);
+				final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
 
 				long enrichedAuthorWithPid = enrichedList
 					.stream()
@@ -105,7 +101,7 @@ public class AuthorMergerTest {
 		right.setSurname("Anand");
 		right.setFullname("Rachna, Anand");
 //        System.out.println(AuthorMerger.normalize(right.getFullname()));
-		boolean same = AuthorMerger.checkSimilarity2(left, right);
+		boolean same = AuthorMerger.checkORCIDSimilarity(left, right);
 
 		assertTrue(same);
 
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
index 9b85ba4f2..3c9e04a21 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@@ -89,7 +89,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
           p
         }
         case (p: Publication, r: Row) =>
-          p.setAuthor(AuthorMerger.enrichOrcid2(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
+          p.setAuthor(AuthorMerger.enrichOrcid(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
           p
       }
       .write