ORCID Enrichment and Download #364
|
@ -159,17 +159,11 @@ public class AuthorMerger {
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
.replaceAll("(\\d)+", " ")
|
.replaceAll("(\\d)+", " ")
|
||||||
.replaceAll("(\\n)+", " ")
|
.replaceAll("(\\n)+", " ")
|
||||||
|
|
||||||
.trim();
|
.trim();
|
||||||
// return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
|
// return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String generateAuthorkey(final Author a) {
|
|
||||||
if (a.getSurname() == null)
|
|
||||||
return "NOSURNAME";
|
|
||||||
|
|
||||||
return normalize(a.getSurname());
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
// public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||||
// if (baseAuthor == null || baseAuthor.isEmpty())
|
// if (baseAuthor == null || baseAuthor.isEmpty())
|
||||||
|
@ -226,54 +220,26 @@ public class AuthorMerger {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean checkSimilarity3(final Author left, final Author right) {
|
|
||||||
|
|
||||||
if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(left.getName())
|
|
||||||
&&
|
|
||||||
StringUtils.isNotBlank(right.getSurname()) && StringUtils.isNotBlank(right.getName())
|
|
||||||
|
|
||||||
)
|
|
||||||
return left.getSurname().equalsIgnoreCase(right.getSurname())
|
|
||||||
&& left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1));
|
|
||||||
|
|
||||||
final Person pl = parse(left);
|
|
||||||
final Person pr = parse(right);
|
|
||||||
|
|
||||||
// If one of them didn't have a surname the match is false
|
|
||||||
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
|
||||||
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
// The Authors have one surname in common
|
|
||||||
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
|
||||||
|
|
||||||
// If one of them has only a surname and is the same we can say that they are the same author
|
|
||||||
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
|
|
||||||
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
|
|
||||||
return true;
|
|
||||||
// The authors have the same initials of Name in common
|
|
||||||
if (pl
|
|
||||||
.getName()
|
|
||||||
.stream()
|
|
||||||
.anyMatch(
|
|
||||||
nl -> pr
|
|
||||||
.getName()
|
|
||||||
.stream()
|
|
||||||
.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static boolean checkSimilarity2(final Author left, final Author right) {
|
public static boolean checkSimilarity2(final Author left, final Author right) {
|
||||||
final Person pl = parse(left);
|
final Person pl = parse(left);
|
||||||
final Person pr = parse(right);
|
final Person pr = parse(right);
|
||||||
|
|
||||||
// If one of them didn't have a surname the match is false
|
// If one of them didn't have a surname we verify if they have the fullName not empty
|
||||||
|
// and verify if the normalized version is equal
|
||||||
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
||||||
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
|
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
|
||||||
return false;
|
|
||||||
|
|
||||||
|
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
|
||||||
|
&& !pr.getFullname().isEmpty()) {
|
||||||
|
return pl
|
||||||
|
.getFullname()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
// The Authors have one surname in common
|
// The Authors have one surname in common
|
||||||
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
||||||
|
|
||||||
|
@ -292,56 +258,18 @@ public class AuthorMerger {
|
||||||
.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
|
.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static boolean checkSimilarity(final Author left, final Author right) {
|
// Sometimes we noticed that publication have author wrote in inverse order Surname, Name
|
||||||
|
// We verify if we have an exact match between name and surname
|
||||||
if (left.getSurname() == null && left.getFullname() == null)
|
if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
|
||||||
|
pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
|
||||||
|
return true;
|
||||||
|
else
|
||||||
return false;
|
return false;
|
||||||
if (right.getSurname() == null && right.getFullname() == null)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
// The Authors have the same surname, or we are tolerant from 1 different char(lets say 1 Typo)
|
|
||||||
if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(right.getSurname())) {
|
|
||||||
if (left.getSurname().equalsIgnoreCase(right.getSurname())
|
|
||||||
|| hammingDist(left.getSurname().toLowerCase(), right.getSurname().toLowerCase()) < 2) {
|
|
||||||
// IN case on of the two Authors has no given Name the match is true
|
|
||||||
if (StringUtils.isBlank(left.getName()) || StringUtils.isBlank(right.getName()))
|
|
||||||
return true;
|
|
||||||
// If the surname is correct, and they have the same name or the name starts with the same Letter we can
|
|
||||||
// say is the same author
|
|
||||||
if (left.getName().equalsIgnoreCase(right.getName())
|
|
||||||
|| left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1)))
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// Different SURNAME
|
|
||||||
else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// This is the case where the two authors have or the surname or the fullname
|
|
||||||
// get the first not null of the surname or fullname of both
|
|
||||||
final String l = authorFieldToBeCompared(left);
|
|
||||||
final String r = authorFieldToBeCompared(right);
|
|
||||||
if (l == null || r == null)
|
|
||||||
return false;
|
|
||||||
// The same length means they are the same field
|
|
||||||
if (l.length() == r.length()) {
|
|
||||||
return normalize(l).equals(normalize(r));
|
|
||||||
}
|
|
||||||
// In this case probably l contains the surname and r contains the fullname
|
|
||||||
if (l.length() < r.length())
|
|
||||||
return normalize(r).contains(normalize(l));
|
|
||||||
// In this case probably l contains the fullname and r contains the surname
|
|
||||||
return normalize(l).contains(normalize(r));
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||||
|
|
||||||
final Integer match_itm = 0;
|
|
||||||
if (baseAuthor == null || baseAuthor.isEmpty())
|
if (baseAuthor == null || baseAuthor.isEmpty())
|
||||||
return orcidAuthor;
|
return orcidAuthor;
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
@ -22,22 +23,15 @@ import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
|
||||||
public class AuthorMergerTest {
|
public class AuthorMergerTest {
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testNormalization() {
|
|
||||||
|
|
||||||
assertEquals("bruzzolasandro", AuthorMerger.normalizeFullName("Sandro, La Bruzzo"));
|
|
||||||
assertEquals("baglionimiriam", AuthorMerger.normalizeFullName("Miriam Baglioni"));
|
|
||||||
assertEquals("baglionimiriam", AuthorMerger.normalizeFullName("Miriam ;Baglioni,"));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testEnrcichAuthor() throws Exception {
|
public void testEnrcichAuthor() throws Exception {
|
||||||
final ObjectMapper mapper = new ObjectMapper();
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
BufferedReader pr = new BufferedReader(new InputStreamReader(
|
BufferedReader pr = new BufferedReader(new InputStreamReader(
|
||||||
AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication.json")));
|
Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json"))));
|
||||||
BufferedReader or = new BufferedReader(new InputStreamReader(
|
BufferedReader or = new BufferedReader(new InputStreamReader(
|
||||||
AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid.json")));
|
Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json"))));
|
||||||
|
|
||||||
TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() {
|
TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() {
|
||||||
};
|
};
|
||||||
|
@ -93,32 +87,27 @@ public class AuthorMergerTest {
|
||||||
enrichedAuthorWithPid);
|
enrichedAuthorWithPid);
|
||||||
|
|
||||||
System.out.println("=================");
|
System.out.println("=================");
|
||||||
|
|
||||||
if (++i > 30)
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void checkSimilarityTest() {
|
public void checkSimilarityTest() {
|
||||||
final Author left = new Author();
|
final Author left = new Author();
|
||||||
left.setSurname("Wu");
|
left.setName("Anand");
|
||||||
left.setName("M.");
|
left.setSurname("Rachna");
|
||||||
left.setFullname("Wu, M.");
|
left.setFullname("Anand, Rachna");
|
||||||
|
|
||||||
System.out.println(AuthorMerger.normalizeFullName(left.getFullname()));
|
System.out.println(AuthorMerger.normalizeFullName(left.getFullname()));
|
||||||
|
|
||||||
final Author right = new Author();
|
final Author right = new Author();
|
||||||
right.setName("Xin");
|
right.setName("Rachna");
|
||||||
right.setSurname("Wu");
|
right.setSurname("Anand");
|
||||||
right.setFullname("Xin Wu");
|
right.setFullname("Rachna, Anand");
|
||||||
// System.out.println(AuthorMerger.normalize(right.getFullname()));
|
// System.out.println(AuthorMerger.normalize(right.getFullname()));
|
||||||
boolean same = AuthorMerger.checkSimilarity2(left, right);
|
boolean same = AuthorMerger.checkSimilarity2(left, right);
|
||||||
|
|
||||||
assertFalse(same);
|
assertTrue(same);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -27,6 +27,8 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
||||||
|
|
||||||
def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = {
|
def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = {
|
||||||
val orcidPublication = generateOrcidTable(spark, orcidPath)
|
val orcidPublication = generateOrcidTable(spark, orcidPath)
|
||||||
|
|
||||||
|
|
||||||
implicit val publicationEncoder = Encoders.bean(classOf[Publication])
|
implicit val publicationEncoder = Encoders.bean(classOf[Publication])
|
||||||
|
|
||||||
val aschema = new StructType()
|
val aschema = new StructType()
|
||||||
|
@ -68,6 +70,16 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
||||||
.select("dnet_id", "orcid_authors")
|
.select("dnet_id", "orcid_authors")
|
||||||
.cache()
|
.cache()
|
||||||
|
|
||||||
|
|
||||||
|
orcidPublication
|
||||||
|
.join(
|
||||||
|
entities,
|
||||||
|
lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
|
||||||
|
lower(col("value")).equalTo(lower(col("pid_value"))),
|
||||||
|
"inner"
|
||||||
|
)
|
||||||
|
.groupBy(col("dnet_id")).agg(collect_set(struct(col("pid_schema"), col("pid_value")))).write.mode("Overwrite").save("/user/sandro.labruzzo/enrich_pub")
|
||||||
|
|
||||||
val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication]
|
val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication]
|
||||||
|
|
||||||
publication
|
publication
|
||||||
|
@ -95,13 +107,14 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
||||||
.where(
|
.where(
|
||||||
"identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'"
|
"identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'"
|
||||||
)
|
)
|
||||||
orcidAuthors
|
val orcidPublication =orcidAuthors
|
||||||
.join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))
|
.join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))
|
||||||
.select(
|
.select(
|
||||||
|
|||||||
col("identifier.schema").alias("schema"),
|
col("identifier.schema").alias("schema"),
|
||||||
col("identifier.value").alias("value"),
|
col("identifier.value").alias("value"),
|
||||||
struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author")
|
struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author")
|
||||||
)
|
)
|
||||||
|
orcidPublication
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
A shorter form is `identifier.schema IN ('doi', 'pmid', ...)`.
Thank you @giambattista.bloisi, I'll update the code.