forked from D-Net/dnet-hadoop
Implemented Author Merger for ORCID that takes into account the case when name and surname are swapped
This commit is contained in:
parent
34a4b3cbdf
commit
6f4d0c05ea
|
@ -159,17 +159,11 @@ public class AuthorMerger {
|
|||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
|
||||
.trim();
|
||||
// return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
|
||||
}
|
||||
|
||||
/**
 * Builds the key for an author from its surname.
 *
 * @param a the author whose surname is read
 * @return the literal "NOSURNAME" when the surname is null, otherwise the
 *         result of normalize(...) applied to the surname
 */
private static String generateAuthorkey(final Author a) {
|
||||
// A null surname maps to a fixed placeholder key
if (a.getSurname() == null)
|
||||
return "NOSURNAME";
|
||||
|
||||
// NOTE(review): normalize is only partially visible here; it appears to strip
// punctuation, digits and newlines - confirm against its full definition
return normalize(a.getSurname());
|
||||
}
|
||||
|
||||
//
|
||||
// public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||
// if (baseAuthor == null || baseAuthor.isEmpty())
|
||||
|
@ -226,54 +220,26 @@ public class AuthorMerger {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
 * Decides whether two author records are similar enough to be treated as the same person.
 *
 * Fast path: when both authors carry a non-blank surname and a non-blank name, the result is
 * true only if the surnames are equal ignoring case and the names start with the same
 * character ignoring case.
 *
 * Otherwise both authors are run through parse(...) and compared on the resulting surname
 * and name collections: they must share a surname (ignoring case), and either one of them
 * has no non-blank name or some pair of names shares the same first character.
 *
 * @param left  first author to compare
 * @param right second author to compare
 * @return true when the authors match under the rules above, false otherwise
 */
public static boolean checkSimilarity3(final Author left, final Author right) {
|
||||
|
||||
// Fast path: both surname and name are non-blank on both sides
if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(left.getName())
|
||||
&&
|
||||
StringUtils.isNotBlank(right.getSurname()) && StringUtils.isNotBlank(right.getName())
|
||||
|
||||
)
|
||||
return left.getSurname().equalsIgnoreCase(right.getSurname())
|
||||
&& left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1));
|
||||
|
||||
// NOTE(review): parse and Person are defined outside this view; presumably parse splits
// the author into collections of name and surname tokens - confirm
final Person pl = parse(left);
|
||||
final Person pr = parse(right);
|
||||
|
||||
// If either author lacks a non-blank surname, the match is false
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
||||
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
|
||||
return false;
|
||||
|
||||
// The authors have at least one surname in common (compared ignoring case)
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
||||
|
||||
// If either author has no non-blank name, the shared surname alone is accepted as a match
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
|
||||
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
|
||||
return true;
// The authors have at least one pair of names starting with the same character
// NOTE(review): substring(0, 1) throws on an empty string; this assumes the parsed
// name collections contain no empty entries - confirm
if (pl
|
||||
.getName()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
nl -> pr
|
||||
.getName()
|
||||
.stream()
|
||||
.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
|
||||
return true;
|
||||
}
|
||||
// No shared surname, or a shared surname without any matching name initial
return false;
|
||||
}
|
||||
|
||||
public static boolean checkSimilarity2(final Author left, final Author right) {
|
||||
final Person pl = parse(left);
|
||||
final Person pr = parse(right);
|
||||
|
||||
// If one of them didn't have a surname the match is false
|
||||
// If one of them didn't have a surname we verify if they have the fullName not empty
|
||||
// and verify if the normalized version is equal
|
||||
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
||||
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
|
||||
return false;
|
||||
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
|
||||
|
||||
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
|
||||
&& !pr.getFullname().isEmpty()) {
|
||||
return pl
|
||||
.getFullname()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// The Authors have one surname in common
|
||||
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
||||
|
||||
|
@ -292,56 +258,18 @@ public class AuthorMerger {
|
|||
.anyMatch(nr -> nr.substring(0, 1).equalsIgnoreCase(nl.substring(0, 1)))))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static boolean checkSimilarity(final Author left, final Author right) {
|
||||
|
||||
if (left.getSurname() == null && left.getFullname() == null)
|
||||
return false;
|
||||
if (right.getSurname() == null && right.getFullname() == null)
|
||||
return false;
|
||||
|
||||
// The authors have the same surname; we tolerate one differing character (i.e. a single typo)
|
||||
if (StringUtils.isNotBlank(left.getSurname()) && StringUtils.isNotBlank(right.getSurname())) {
|
||||
if (left.getSurname().equalsIgnoreCase(right.getSurname())
|
||||
|| hammingDist(left.getSurname().toLowerCase(), right.getSurname().toLowerCase()) < 2) {
|
||||
// In case one of the two authors has no given name, the match is true
|
||||
if (StringUtils.isBlank(left.getName()) || StringUtils.isBlank(right.getName()))
|
||||
// We noticed that publications sometimes list authors in inverse order (Surname, Name)
|
||||
// We verify if we have an exact match between name and surname
|
||||
if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
|
||||
pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
|
||||
return true;
|
||||
// If the surname is correct, and they have the same name or the name starts with the same Letter we can
|
||||
// say is the same author
|
||||
if (left.getName().equalsIgnoreCase(right.getName())
|
||||
|| left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1)))
|
||||
return true;
|
||||
}
|
||||
// Different SURNAME
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// This is the case where each author has either the surname or the fullname
|
||||
// get the first not null of the surname or fullname of both
|
||||
final String l = authorFieldToBeCompared(left);
|
||||
final String r = authorFieldToBeCompared(right);
|
||||
if (l == null || r == null)
|
||||
return false;
|
||||
// The same length means they are the same field
|
||||
if (l.length() == r.length()) {
|
||||
return normalize(l).equals(normalize(r));
|
||||
}
|
||||
// In this case probably l contains the surname and r contains the fullname
|
||||
if (l.length() < r.length())
|
||||
return normalize(r).contains(normalize(l));
|
||||
// In this case probably l contains the fullname and r contains the surname
|
||||
return normalize(l).contains(normalize(r));
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||
|
||||
final Integer match_itm = 0;
|
||||
if (baseAuthor == null || baseAuthor.isEmpty())
|
||||
return orcidAuthor;
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ import java.io.IOException;
|
|||
import java.io.InputStreamReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
@ -22,22 +23,15 @@ import eu.dnetlib.dhp.schema.oaf.Author;
|
|||
|
||||
public class AuthorMergerTest {
|
||||
|
||||
|
||||
@Test
|
||||
/**
 * Checks AuthorMerger.normalizeFullName against three fixed inputs: the expected strings are
 * lower-case, contain no separators, and are the same for "Miriam Baglioni" and
 * "Miriam ;Baglioni,".
 */
public void testNormalization() {
|
||||
|
||||
// The comma is dropped and the expected tokens appear in a different order than in the input
assertEquals("bruzzolasandro", AuthorMerger.normalizeFullName("Sandro, La Bruzzo"));
|
||||
assertEquals("baglionimiriam", AuthorMerger.normalizeFullName("Miriam Baglioni"));
|
||||
// Stray ';' and ',' must not change the normalized form
assertEquals("baglionimiriam", AuthorMerger.normalizeFullName("Miriam ;Baglioni,"));
|
||||
|
||||
}
|
||||
|
||||
public void testEnrcichAuthor() throws Exception {
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
BufferedReader pr = new BufferedReader(new InputStreamReader(
|
||||
AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication.json")));
|
||||
Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication_sample.json"))));
|
||||
BufferedReader or = new BufferedReader(new InputStreamReader(
|
||||
AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid.json")));
|
||||
Objects.requireNonNull(AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid_sample.json"))));
|
||||
|
||||
TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() {
|
||||
};
|
||||
|
@ -93,32 +87,27 @@ public class AuthorMergerTest {
|
|||
enrichedAuthorWithPid);
|
||||
|
||||
System.out.println("=================");
|
||||
|
||||
if (++i > 30)
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void checkSimilarityTest() {
|
||||
final Author left = new Author();
|
||||
left.setSurname("Wu");
|
||||
left.setName("M.");
|
||||
left.setFullname("Wu, M.");
|
||||
left.setName("Anand");
|
||||
left.setSurname("Rachna");
|
||||
left.setFullname("Anand, Rachna");
|
||||
|
||||
System.out.println(AuthorMerger.normalizeFullName(left.getFullname()));
|
||||
|
||||
final Author right = new Author();
|
||||
right.setName("Xin");
|
||||
right.setSurname("Wu");
|
||||
right.setFullname("Xin Wu");
|
||||
right.setName("Rachna");
|
||||
right.setSurname("Anand");
|
||||
right.setFullname("Rachna, Anand");
|
||||
// System.out.println(AuthorMerger.normalize(right.getFullname()));
|
||||
boolean same = AuthorMerger.checkSimilarity2(left, right);
|
||||
|
||||
assertFalse(same);
|
||||
assertTrue(same);
|
||||
|
||||
}
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -27,6 +27,8 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
|||
|
||||
def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = {
|
||||
val orcidPublication = generateOrcidTable(spark, orcidPath)
|
||||
|
||||
|
||||
implicit val publicationEncoder = Encoders.bean(classOf[Publication])
|
||||
|
||||
val aschema = new StructType()
|
||||
|
@ -68,6 +70,16 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
|||
.select("dnet_id", "orcid_authors")
|
||||
.cache()
|
||||
|
||||
|
||||
orcidPublication
|
||||
.join(
|
||||
entities,
|
||||
lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
|
||||
lower(col("value")).equalTo(lower(col("pid_value"))),
|
||||
"inner"
|
||||
)
|
||||
.groupBy(col("dnet_id")).agg(collect_set(struct(col("pid_schema"), col("pid_value")))).write.mode("Overwrite").save("/user/sandro.labruzzo/enrich_pub")
|
||||
|
||||
val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication]
|
||||
|
||||
publication
|
||||
|
@ -95,13 +107,14 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
|||
.where(
|
||||
"identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'"
|
||||
)
|
||||
orcidAuthors
|
||||
val orcidPublication =orcidAuthors
|
||||
.join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))
|
||||
.select(
|
||||
col("identifier.schema").alias("schema"),
|
||||
col("identifier.value").alias("value"),
|
||||
struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author")
|
||||
)
|
||||
orcidPublication
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue