ORCID Enrichment and Download #364

Merged
claudio.atzori merged 14 commits from orcid_import into beta 2023-12-01 15:05:45 +01:00
5 changed files with 51 additions and 116 deletions
Showing only changes of commit 6f4d0c05ea - Show all commits

View File

@ -159,17 +159,11 @@ public class AuthorMerger {
.replaceAll("(\\p{Punct})+", " ") .replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ") .replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ") .replaceAll("(\\n)+", " ")
.trim(); .trim();
// return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining()); // return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
} }
private static String generateAuthorkey(final Author a) {
if (a.getSurname() == null)
return "NOSURNAME";
return normalize(a.getSurname());
}
// //
// public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) { // public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
// if (baseAuthor == null || baseAuthor.isEmpty()) // if (baseAuthor == null || baseAuthor.isEmpty())
@ -226,54 +220,26 @@ public class AuthorMerger {
return null; return null;
} }
public static boolean checkSimilarity3(final Author left, final Author right) {
if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(left.getName())
&&
StringUtils.isNotBlank(right.getSurname()) && StringUtils.isNotBlank(right.getName())
)
return left.getSurname().equalsIgnoreCase(right.getSurname())
&& left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1));
final Person pl = parse(left);
final Person pr = parse(right);
// If one of them didn't have a surname the match is false
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
return false;
// The Authors have one surname in common
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
// If one of them has only a surname and is the same we can say that they are the same author
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
return true;
// The authors have the same initials of Name in common
if (pl
.getName()
.stream()
.anyMatch(
nl -> pr
.getName()
.stream()
.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
return true;
}
return false;
}
public static boolean checkSimilarity2(final Author left, final Author right) { public static boolean checkSimilarity2(final Author left, final Author right) {
final Person pl = parse(left); final Person pl = parse(left);
final Person pr = parse(right); final Person pr = parse(right);
// If one of them didn't have a surname the match is false // If one of them didn't have a surname we verify if they have the fullName not empty
// and verify if the normalized version is equal
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) && if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
return false;
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
&& !pr.getFullname().isEmpty()) {
return pl
.getFullname()
.stream()
.anyMatch(
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
} else {
return false;
}
}
// The Authors have one surname in common // The Authors have one surname in common
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) { if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
@ -292,56 +258,18 @@ public class AuthorMerger {
.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1))))) .anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
return true; return true;
} }
return false;
}
public static boolean checkSimilarity(final Author left, final Author right) { // Sometimes we noticed that publication have author wrote in inverse order Surname, Name
// We verify if we have an exact match between name and surname
if (left.getSurname() == null && left.getFullname() == null) if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
return false; pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
if (right.getSurname() == null && right.getFullname() == null)
return false;
// The Authors have the same surname, or we are tolerant from 1 different char(lets say 1 Typo)
if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(right.getSurname())) {
if (left.getSurname().equalsIgnoreCase(right.getSurname())
|| hammingDist(left.getSurname().toLowerCase(), right.getSurname().toLowerCase()) < 2) {
// IN case on of the two Authors has no given Name the match is true
if (StringUtils.isBlank(left.getName()) || StringUtils.isBlank(right.getName()))
return true; return true;
// If the surname is correct, and they have the same name or the name starts with the same Letter we can else
// say is the same author
if (left.getName().equalsIgnoreCase(right.getName())
|| left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1)))
return true;
}
// Different SURNAME
else {
return false;
}
} else {
// This is the case where the two authors have or the surname or the fullname
// get the first not null of the surname or fullname of both
final String l = authorFieldToBeCompared(left);
final String r = authorFieldToBeCompared(right);
if (l == null || r == null)
return false;
// The same length means they are the same field
if (l.length() == r.length()) {
return normalize(l).equals(normalize(r));
}
// In this case probably l contains the surname and r contains the fullname
if (l.length() < r.length())
return normalize(r).contains(normalize(l));
// In this case probably l contains the fullname and r contains the surname
return normalize(l).contains(normalize(r));
}
return false; return false;
} }
public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) { public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
final Integer match_itm = 0;
if (baseAuthor == null || baseAuthor.isEmpty()) if (baseAuthor == null || baseAuthor.isEmpty())
return orcidAuthor; return orcidAuthor;

View File

@ -9,6 +9,7 @@ import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -22,22 +23,15 @@ import eu.dnetlib.dhp.schema.oaf.Author;
public class AuthorMergerTest { public class AuthorMergerTest {
@Test @Test
public void testNormalization() {
assertEquals("bruzzolasandro", AuthorMerger.normalizeFullName("Sandro, La Bruzzo"));
assertEquals("baglionimiriam", AuthorMerger.normalizeFullName("Miriam Baglioni"));
assertEquals("baglionimiriam", AuthorMerger.normalizeFullName("Miriam ;Baglioni,"));
}
public void testEnrcichAuthor() throws Exception { public void testEnrcichAuthor() throws Exception {
final ObjectMapper mapper = new ObjectMapper(); final ObjectMapper mapper = new ObjectMapper();
BufferedReader pr = new BufferedReader(new InputStreamReader( BufferedReader pr = new BufferedReader(new InputStreamReader(
AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication.json"))); Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json"))));
BufferedReader or = new BufferedReader(new InputStreamReader( BufferedReader or = new BufferedReader(new InputStreamReader(
AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid.json"))); Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json"))));
TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() { TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() {
}; };
@ -93,32 +87,27 @@ public class AuthorMergerTest {
enrichedAuthorWithPid); enrichedAuthorWithPid);
System.out.println("================="); System.out.println("=================");
if (++i > 30)
break;
} }
} }
} }
@Test @Test
public void checkSimilarityTest() { public void checkSimilarityTest() {
final Author left = new Author(); final Author left = new Author();
left.setSurname("Wu"); left.setName("Anand");
left.setName("M."); left.setSurname("Rachna");
left.setFullname("Wu, M."); left.setFullname("Anand, Rachna");
System.out.println(AuthorMerger.normalizeFullName(left.getFullname())); System.out.println(AuthorMerger.normalizeFullName(left.getFullname()));
final Author right = new Author(); final Author right = new Author();
right.setName("Xin"); right.setName("Rachna");
right.setSurname("Wu"); right.setSurname("Anand");
right.setFullname("Xin Wu"); right.setFullname("Rachna, Anand");
// System.out.println(AuthorMerger.normalize(right.getFullname())); // System.out.println(AuthorMerger.normalize(right.getFullname()));
boolean same = AuthorMerger.checkSimilarity2(left, right); boolean same = AuthorMerger.checkSimilarity2(left, right);
assertFalse(same); assertTrue(same);
} }

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -27,6 +27,8 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = { def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = {
val orcidPublication = generateOrcidTable(spark, orcidPath) val orcidPublication = generateOrcidTable(spark, orcidPath)
implicit val publicationEncoder = Encoders.bean(classOf[Publication]) implicit val publicationEncoder = Encoders.bean(classOf[Publication])
val aschema = new StructType() val aschema = new StructType()
@ -68,6 +70,16 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
.select("dnet_id", "orcid_authors") .select("dnet_id", "orcid_authors")
.cache() .cache()
orcidPublication
.join(
entities,
lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
lower(col("value")).equalTo(lower(col("pid_value"))),
"inner"
)
.groupBy(col("dnet_id")).agg(collect_set(struct(col("pid_schema"), col("pid_value")))).write.mode("Overwrite").save("/user/sandro.labruzzo/enrich_pub")
val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication] val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication]
publication publication
@ -95,13 +107,14 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
.where( .where(
"identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'" "identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'"
) )
orcidAuthors val orcidPublication =orcidAuthors
.join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid"))) .join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))
.select( .select(

A shorter form is `identifier.schema IN ('doi', 'pmid', ...)`.

A shorter form is `identifier.schema IN ('doi', 'pmid', ...)`.
Review

Thank you, @giambattista.bloisi. I'll update the code.

Thank you, @giambattista.bloisi. I'll update the code.
col("identifier.schema").alias("schema"), col("identifier.schema").alias("schema"),
col("identifier.value").alias("value"), col("identifier.value").alias("value"),
struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author") struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author")
) )
orcidPublication
} }
} }