Implemented Author Merger for ORCID that takes into account the case when name and surname are swapped

2023-11-28 08:43:56 +01:00 · 2023-11-28 08:43:56 +01:00 · 6f4d0c05ea
parent 34a4b3cbdf
commit 6f4d0c05ea
5 changed files with 51 additions and 116 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -159,17 +159,11 @@ public class AuthorMerger {
 			.replaceAll("(\\p{Punct})+", " ")
 			.replaceAll("(\\d)+", " ")
 			.replaceAll("(\\n)+", " ")
+
 			.trim();
 //        return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
 	}

-	private static String generateAuthorkey(final Author a) {
-		if (a.getSurname() == null)
-			return "NOSURNAME";
-
-		return normalize(a.getSurname());
-	}
-
 //
 //    public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
 //        if (baseAuthor == null || baseAuthor.isEmpty())
@ -226,54 +220,26 @@ public class AuthorMerger {
 		return null;
 	}

-	public static boolean checkSimilarity3(final Author left, final Author right) {
-
-		if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(left.getName())
-			&&
-			StringUtils.isNotBlank(right.getSurname()) && StringUtils.isNotBlank(right.getName())
-
-		)
-			return left.getSurname().equalsIgnoreCase(right.getSurname())
-				&& left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1));
-
-		final Person pl = parse(left);
-		final Person pr = parse(right);
-
-		// If one of them didn't have a surname the match is false
-		if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
-			pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
-			return false;
-
-		// The Authors have one surname in common
-		if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
-
-			// If one of them has only a surname and is the same we can say that they are the same author
-			if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
-				(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
-				return true;
-			// The authors have the same initials of Name in common
-			if (pl
-				.getName()
-				.stream()
-				.anyMatch(
-					nl -> pr
-						.getName()
-						.stream()
-						.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
-				return true;
-		}
-		return false;
-	}
-
 	public static boolean checkSimilarity2(final Author left, final Author right) {
 		final Person pl = parse(left);
 		final Person pr = parse(right);

-		// If one of them didn't have a surname the match is false
+		// If either of them has no surname, we check that both have a non-empty fullName
+		// and verify whether their normalized versions are equal
 		if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
-			pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
-			return false;
+			pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {

+			if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
+				&& !pr.getFullname().isEmpty()) {
+				return pl
+					.getFullname()
+					.stream()
+					.anyMatch(
+						fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
+			} else {
+				return false;
+			}
+		}
 		// The Authors have one surname in common
 		if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {

@ -292,56 +258,18 @@ public class AuthorMerger {
 						.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
 				return true;
 		}
-		return false;
-	}

-	public static boolean checkSimilarity(final Author left, final Author right) {
-
-		if (left.getSurname() == null && left.getFullname() == null)
-			return false;
-		if (right.getSurname() == null && right.getFullname() == null)
-			return false;
-
-		// The Authors have the same surname, or we are tolerant from 1 different char(lets say 1 Typo)
-		if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(right.getSurname())) {
-			if (left.getSurname().equalsIgnoreCase(right.getSurname())
-				|| hammingDist(left.getSurname().toLowerCase(), right.getSurname().toLowerCase()) < 2) {
-				// IN case on of the two Authors has no given Name the match is true
-				if (StringUtils.isBlank(left.getName()) || StringUtils.isBlank(right.getName()))
+		// Sometimes we noticed that publications have authors written in inverted order (Surname, Name).
+		// We check for an exact, case-insensitive cross-match: left surname equals right name and left name equals right surname
+		if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
+			pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
 			return true;
-				// If the surname is correct, and they have the same name or the name starts with the same Letter we can
-				// say is the same author
-				if (left.getName().equalsIgnoreCase(right.getName())
-					|| left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1)))
-					return true;
-			}
-			// Different SURNAME
-			else {
-				return false;
-			}
-		} else {
-			// This is the case where the two authors have or the surname or the fullname
-			// get the first not null of the surname or fullname of both
-			final String l = authorFieldToBeCompared(left);
-			final String r = authorFieldToBeCompared(right);
-			if (l == null || r == null)
-				return false;
-			// The same length means they are the same field
-			if (l.length() == r.length()) {
-				return normalize(l).equals(normalize(r));
-			}
-			// In this case probably l contains the surname and r contains the fullname
-			if (l.length() < r.length())
-				return normalize(r).contains(normalize(l));
-			// In this case probably l contains the fullname and r contains the surname
-			return normalize(l).contains(normalize(r));
-		}
+		else
 			return false;
 	}

 	public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {

-		final Integer match_itm = 0;
 		if (baseAuthor == null || baseAuthor.isEmpty())
 			return orcidAuthor;

--- a/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
@ -9,6 +9,7 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Objects;
 import java.util.stream.Collectors;

 import org.junit.jupiter.api.Test;
@ -22,22 +23,15 @@ import eu.dnetlib.dhp.schema.oaf.Author;

 public class AuthorMergerTest {

+
 	@Test
-	public void testNormalization() {
-
-		assertEquals("bruzzolasandro", AuthorMerger.normalizeFullName("Sandro, La Bruzzo"));
-		assertEquals("baglionimiriam", AuthorMerger.normalizeFullName("Miriam Baglioni"));
-		assertEquals("baglionimiriam", AuthorMerger.normalizeFullName("Miriam ;Baglioni,"));
-
-	}
-
 	public void testEnrcichAuthor() throws Exception {
 		final ObjectMapper mapper = new ObjectMapper();

 		BufferedReader pr = new BufferedReader(new InputStreamReader(
-			AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication.json")));
+				Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json"))));
 		BufferedReader or = new BufferedReader(new InputStreamReader(
-			AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid.json")));
+				Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json"))));

 		TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() {
 		};
@ -93,32 +87,27 @@ public class AuthorMergerTest {
 						enrichedAuthorWithPid);

 				System.out.println("=================");
-
-				if (++i > 30)
-					break;
 			}
-
 		}
-
 	}

 	@Test
 	public void checkSimilarityTest() {
 		final Author left = new Author();
-		left.setSurname("Wu");
-		left.setName("M.");
-		left.setFullname("Wu, M.");
+		left.setName("Anand");
+		left.setSurname("Rachna");
+		left.setFullname("Anand, Rachna");

 		System.out.println(AuthorMerger.normalizeFullName(left.getFullname()));

 		final Author right = new Author();
-		right.setName("Xin");
-		right.setSurname("Wu");
-		right.setFullname("Xin Wu");
+		right.setName("Rachna");
+		right.setSurname("Anand");
+		right.setFullname("Rachna, Anand");
 //        System.out.println(AuthorMerger.normalize(right.getFullname()));
 		boolean same = AuthorMerger.checkSimilarity2(left, right);

-		assertFalse(same);
+		assertTrue(same);

 	}

--- a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json
--- a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@ -27,6 +27,8 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]

  def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = {
    val orcidPublication = generateOrcidTable(spark, orcidPath)
+
+
    implicit val publicationEncoder = Encoders.bean(classOf[Publication])

    val aschema = new StructType()
@ -68,6 +70,16 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
      .select("dnet_id", "orcid_authors")
      .cache()

+
+    orcidPublication
+      .join(
+        entities,
+        lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
+          lower(col("value")).equalTo(lower(col("pid_value"))),
+        "inner"
+      )
+      .groupBy(col("dnet_id")).agg(collect_set(struct(col("pid_schema"), col("pid_value")))).write.mode("Overwrite").save("/user/sandro.labruzzo/enrich_pub")
+
    val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication]

    publication
@ -95,13 +107,14 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
      .where(
        "identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'"
      )
-    orcidAuthors
+    val orcidPublication =orcidAuthors
      .join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))
      .select(
        col("identifier.schema").alias("schema"),
        col("identifier.value").alias("value"),
        struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author")
      )
+    orcidPublication
  }
 }