2023-12-01 15:05:45 +01:00 · 2023-12-01 11:27:11 +01:00 · 2023-12-01 12:15:17 +01:00
5 changed files with 51 additions and 116 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -159,17 +159,11 @@ public class AuthorMerger {
 			.replaceAll("(\\p{Punct})+", " ")
 			.replaceAll("(\\d)+", " ")
 			.replaceAll("(\\n)+", " ")
 			.trim();
 //        return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
 	}
 	/**
 	 * Derives a key for an author from its surname.
 	 *
 	 * @param a the author whose surname is read
 	 * @return the literal {@code "NOSURNAME"} when the surname is {@code null}, otherwise the surname passed
 	 *         through {@code normalize}
 	 */
 	private static String generateAuthorkey(final Author a) {
 		final String surname = a.getSurname();
 		// A missing surname collapses onto one shared placeholder key.
 		return surname == null ? "NOSURNAME" : normalize(surname);
 	}
 //
 //    public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
 //        if (baseAuthor == null || baseAuthor.isEmpty())
@ -226,54 +220,26 @@ public class AuthorMerger {
 		return null;
 	}
 	/**
 	 * Decides whether two author records can be considered the same person.
 	 * <p>
 	 * When both records carry a non-blank surname and a non-blank name, they match if the surnames are equal
 	 * ignoring case and the names start with the same character (ignoring case). Otherwise both records are
 	 * passed through {@code parse} and the resulting surname and name token lists are compared: the two sides
 	 * must share at least one surname token and, when both sides have name tokens, at least one pair of name
 	 * tokens must start with the same character.
 	 * <p>
 	 * Blank or {@code null} tokens are never used as evidence of a match and never reach {@code substring},
 	 * so an empty token cannot raise a {@code StringIndexOutOfBoundsException}.
 	 *
 	 * @param left  the first author to compare
 	 * @param right the second author to compare
 	 * @return {@code true} when the two records are considered the same author, {@code false} otherwise
 	 */
 	public static boolean checkSimilarity3(final Author left, final Author right) {
 		// Fast path: both records expose a structured surname and name, so no parsing is needed.
 		// isNotBlank guarantees at least one character, hence substring(0, 1) is safe here.
 		if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(left.getName())
 			&& StringUtils.isNotBlank(right.getSurname()) && StringUtils.isNotBlank(right.getName()))
 			return left.getSurname().equalsIgnoreCase(right.getSurname())
 				&& left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1));
 		final Person pl = parse(left);
 		final Person pr = parse(right);
 		// If one of them didn't have a surname the match is false
 		if (pl.getSurname() == null || pl.getSurname().stream().noneMatch(StringUtils::isNotBlank)
 			|| pr.getSurname() == null || pr.getSurname().stream().noneMatch(StringUtils::isNotBlank))
 			return false;
 		// The authors must have one surname in common; blank tokens are skipped so that two empty
 		// tokens do not count as a shared surname, and a null token on the right simply does not match.
 		final boolean surnameInCommon = pl
 			.getSurname()
 			.stream()
 			.filter(StringUtils::isNotBlank)
 			.anyMatch(sl -> pr.getSurname().stream().anyMatch(sl::equalsIgnoreCase));
 		if (!surnameInCommon)
 			return false;
 		// If one of them has only a surname and it is the same, we can say that they are the same author
 		if (pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)
 			|| pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank))
 			return true;
 		// The authors have the same initial in at least one pair of name tokens.
 		// Blank tokens are filtered out before taking the first character.
 		return pl
 			.getName()
 			.stream()
 			.filter(StringUtils::isNotBlank)
 			.anyMatch(
 				nl -> pr
 					.getName()
 					.stream()
 					.filter(StringUtils::isNotBlank)
 					.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1))));
 	}
 	public static boolean checkSimilarity2(final Author left, final Author right) {
 		final Person pl = parse(left);
 		final Person pr = parse(right);
-		// If one of them didn't have a surname the match is false
+		// If one of them didn't have a surname we verify if they have the fullName not empty
 		// and verify if the normalized version is equal
 		if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
-			pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
+			pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
 			return false;
 			if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
 				&& !pr.getFullname().isEmpty()) {
 				return pl
 					.getFullname()
 					.stream()
 					.anyMatch(
 						fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
 			} else {
 				return false;
 			}
 		}
 		// The Authors have one surname in common
 		if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
@ -292,56 +258,18 @@ public class AuthorMerger {
 						.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
 				return true;
 		}
 		return false;
 	}
-	public static boolean checkSimilarity(final Author left, final Author right) {
+		// Sometimes we noticed that publications have authors written in inverse order: Surname, Name
-
+		// We verify if we have an exact match between name and surname
-		if (left.getSurname() == null && left.getFullname() == null)
+		if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
-			return false;
+			pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
 		if (right.getSurname() == null && right.getFullname() == null)
 			return false;
 		// The Authors have the same surname, or we are tolerant from 1 different char(lets say 1 Typo)
 		if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(right.getSurname())) {
 			if (left.getSurname().equalsIgnoreCase(right.getSurname())
 				|| hammingDist(left.getSurname().toLowerCase(), right.getSurname().toLowerCase()) < 2) {
 				// In case one of the two Authors has no given Name the match is true
 				if (StringUtils.isBlank(left.getName()) || StringUtils.isBlank(right.getName()))
 			return true;
-				// If the surname is correct, and they have the same name or the name starts with the same Letter we can
+		else
 				// say is the same author
 				if (left.getName().equalsIgnoreCase(right.getName())
 					|| left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1)))
 					return true;
 			}
 			// Different SURNAME
 			else {
 				return false;
 			}
 		} else {
 			// This is the case where the two authors have or the surname or the fullname
 			// get the first not null of the surname or fullname of both
 			final String l = authorFieldToBeCompared(left);
 			final String r = authorFieldToBeCompared(right);
 			if (l == null || r == null)
 				return false;
 			// The same length means they are the same field
 			if (l.length() == r.length()) {
 				return normalize(l).equals(normalize(r));
 			}
 			// In this case probably l contains the surname and r contains the fullname
 			if (l.length() < r.length())
 				return normalize(r).contains(normalize(l));
 			// In this case probably l contains the fullname and r contains the surname
 			return normalize(l).contains(normalize(r));
 		}
 			return false;
 	}
 	public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
 		final Integer match_itm = 0;
 		if (baseAuthor == null || baseAuthor.isEmpty())
 			return orcidAuthor;
--- a/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
@ -9,6 +9,7 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;
 import java.util.stream.Collectors;
 import org.junit.jupiter.api.Test;
@ -22,22 +23,15 @@ import eu.dnetlib.dhp.schema.oaf.Author;
 public class AuthorMergerTest {
 	/**
 	 * Pins the output of {@code AuthorMerger.normalizeFullName} for three spellings of a full name.
 	 * As the expected values show, the result is lower case, contains no separators or punctuation, and is
 	 * the same for "Miriam Baglioni" and "Miriam ;Baglioni,".
 	 */
 	@Test
 	public void testNormalization() {
 		// Comma-separated input with a multi-token surname.
 		final String commaSeparated = AuthorMerger.normalizeFullName("Sandro, La Bruzzo");
 		assertEquals("bruzzolasandro", commaSeparated);
 		// Plain "Name Surname" input.
 		final String spaceSeparated = AuthorMerger.normalizeFullName("Miriam Baglioni");
 		assertEquals("baglionimiriam", spaceSeparated);
 		// Same name with stray punctuation must normalize to the same value.
 		final String withStrayPunctuation = AuthorMerger.normalizeFullName("Miriam ;Baglioni,");
 		assertEquals("baglionimiriam", withStrayPunctuation);
 	}
 	public void testEnrcichAuthor() throws Exception {
 		final ObjectMapper mapper = new ObjectMapper();
 		BufferedReader pr = new BufferedReader(new InputStreamReader(
-			AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication.json")));
+				Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json"))));
 		BufferedReader or = new BufferedReader(new InputStreamReader(
-			AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid.json")));
+				Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json"))));
 		TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() {
 		};
@ -93,32 +87,27 @@ public class AuthorMergerTest {
 						enrichedAuthorWithPid);
 				System.out.println("=================");
 				if (++i > 30)
 					break;
 			}
 		}
 	}
 	/**
 	 * Checks the verdict of {@code AuthorMerger.checkSimilarity2} on a hand-built pair of authors.
 	 * In the added lines of this change the two authors carry the same two tokens ("Anand", "Rachna") with
 	 * name and surname swapped, and the expected verdict is {@code true}. The removed lines held the
 	 * previous fixture ("Wu, M." against "Xin Wu"), whose expected verdict was {@code false}.
 	 */
 	@Test
 	public void checkSimilarityTest() {
 		// Left author: built field by field, then its normalized full name is printed for inspection.
 		final Author left = new Author();
-		left.setSurname("Wu");
+		left.setName("Anand");
-		left.setName("M.");
+		left.setSurname("Rachna");
-		left.setFullname("Wu, M.");
+		left.setFullname("Anand, Rachna");
 		System.out.println(AuthorMerger.normalizeFullName(left.getFullname()));
 		// Right author: same tokens as the left one, with name and surname exchanged in the added lines.
 		final Author right = new Author();
-		right.setName("Xin");
+		right.setName("Rachna");
-		right.setSurname("Wu");
+		right.setSurname("Anand");
-		right.setFullname("Xin Wu");
+		right.setFullname("Rachna, Anand");
 //        System.out.println(AuthorMerger.normalize(right.getFullname()));
 		// The comparison under test; the assertion below pins its expected verdict.
 		boolean same = AuthorMerger.checkSimilarity2(left, right);
-		assertFalse(same);
+		assertTrue(same);
 	}
--- a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json
--- a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@ -27,6 +27,8 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
  def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = {
    val orcidPublication = generateOrcidTable(spark, orcidPath)
    implicit val publicationEncoder = Encoders.bean(classOf[Publication])
    val aschema = new StructType()
@ -68,6 +70,16 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
      .select("dnet_id", "orcid_authors")
      .cache()
    orcidPublication
      .join(
        entities,
        lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
          lower(col("value")).equalTo(lower(col("pid_value"))),
        "inner"
      )
      .groupBy(col("dnet_id")).agg(collect_set(struct(col("pid_schema"), col("pid_value")))).write.mode("Overwrite").save("/user/sandro.labruzzo/enrich_pub")
    val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication]
    publication
@ -95,13 +107,14 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
      .where(
        "identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'"
      )
-    orcidAuthors
+    val orcidPublication =orcidAuthors
      .join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))
      .select(
        col("identifier.schema").alias("schema"),
        col("identifier.value").alias("value"),
        struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author")
      )
    orcidPublication
  }
 }