1
0
Fork 0

code formatting

This commit is contained in:
Sandro La Bruzzo 2023-11-30 15:31:42 +01:00
parent 5e22b67b8a
commit cdfb7588dd
5 changed files with 268 additions and 300 deletions

View File

@ -4,352 +4,329 @@ package eu.dnetlib.dhp.oa.merge;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import scala.Tuple2; import scala.Tuple2;
/**
 * Mutable holder that pairs an author position with an ORCID position and the similarity
 * value stored for that pair via {@link #setValues(int, int, double)}.
 * <p>
 * Instances are ordered by {@link #maxColumnSimilarity} only. {@code equals}/{@code hashCode}
 * are not overridden, so two instances that compare as {@code 0} are not necessarily equal.
 */
class SimilarityCellInfo implements Comparable<SimilarityCellInfo> {

	/** Author position; starts at 0. */
	public int authorPosition;

	/** ORCID position; starts at 0. */
	public int orcidPosition;

	/** Similarity value used for ordering; starts at 0.0. */
	public double maxColumnSimilarity;

	/** Creates a cell with both positions at 0 and a similarity of 0.0. */
	public SimilarityCellInfo() {
	}

	/**
	 * Overwrites all three fields in one call.
	 *
	 * @param authPos    new value for {@link #authorPosition}
	 * @param orcidPos   new value for {@link #orcidPosition}
	 * @param similarity new value for {@link #maxColumnSimilarity}
	 */
	public void setValues(final int authPos, final int orcidPos, final double similarity) {
		authorPosition = authPos;
		orcidPosition = orcidPos;
		maxColumnSimilarity = similarity;
	}

	/**
	 * Orders cells by ascending similarity, with the semantics of
	 * {@link Double#compare(double, double)}.
	 */
	@Override
	public int compareTo(@NotNull SimilarityCellInfo other) {
		final double mine = this.maxColumnSimilarity;
		final double theirs = other.maxColumnSimilarity;
		return Double.compare(mine, theirs);
	}
}
public class AuthorMerger { public class AuthorMerger {
/** Similarity threshold applied by {@code mergeAuthor(List, List)} when the caller gives none. */
private static final Double THRESHOLD = 0.95;

/** Utility class: only static members, never instantiated. */
private AuthorMerger() {
}
public static List<Author> merge(List<List<Author>> authors) { public static List<Author> merge(List<List<Author>> authors) {
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2))); authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
List<Author> author = new ArrayList<>(); List<Author> author = new ArrayList<>();
for (List<Author> a : authors) { for (List<Author> a : authors) {
author = mergeAuthor(author, a); author = mergeAuthor(author, a);
} }
return author; return author;
} }
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) { public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
int pa = countAuthorsPids(a); int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b); int pb = countAuthorsPids(b);
List<Author> base; List<Author> base;
List<Author> enrich; List<Author> enrich;
int sa = authorsSize(a); int sa = authorsSize(a);
int sb = authorsSize(b); int sb = authorsSize(b);
if (sa == sb) { if (sa == sb) {
base = pa > pb ? a : b; base = pa > pb ? a : b;
enrich = pa > pb ? b : a; enrich = pa > pb ? b : a;
} else { } else {
base = sa > sb ? a : b; base = sa > sb ? a : b;
enrich = sa > sb ? b : a; enrich = sa > sb ? b : a;
} }
enrichPidFromList(base, enrich, threshold); enrichPidFromList(base, enrich, threshold);
return base; return base;
} }
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) { public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
return mergeAuthor(a, b, THRESHOLD); return mergeAuthor(a, b, THRESHOLD);
} }
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) { private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
if (base == null || enrich == null) if (base == null || enrich == null)
return; return;
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list) // <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
final Map<String, Author> basePidAuthorMap = base final Map<String, Author> basePidAuthorMap = base
.stream() .stream()
.filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.flatMap( .flatMap(
a -> a a -> a
.getPid() .getPid()
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map(p -> new Tuple2<>(pidToComparableString(p), a))) .map(p -> new Tuple2<>(pidToComparableString(p), a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
// <pid, Author> (list of pid that are missing in the other list) // <pid, Author> (list of pid that are missing in the other list)
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream() .stream()
.filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.flatMap( .flatMap(
a -> a a -> a
.getPid() .getPid()
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
.map(p -> new Tuple2<>(p, a))) .map(p -> new Tuple2<>(p, a)))
.collect(Collectors.toList()); .collect(Collectors.toList());
pidToEnrich pidToEnrich
.forEach( .forEach(
a -> { a -> {
Optional<Tuple2<Double, Author>> simAuthor = base Optional<Tuple2<Double, Author>> simAuthor = base
.stream() .stream()
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) .map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
.max(Comparator.comparing(Tuple2::_1)); .max(Comparator.comparing(Tuple2::_1));
if (simAuthor.isPresent()) { if (simAuthor.isPresent()) {
double th = threshold; double th = threshold;
// increase the threshold if the surname is too short // increase the threshold if the surname is too short
if (simAuthor.get()._2().getSurname() != null if (simAuthor.get()._2().getSurname() != null
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0) && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
th = 0.99; th = 0.99;
if (simAuthor.get()._1() > th) { if (simAuthor.get()._1() > th) {
Author r = simAuthor.get()._2(); Author r = simAuthor.get()._2();
if (r.getPid() == null) { if (r.getPid() == null) {
r.setPid(new ArrayList<>()); r.setPid(new ArrayList<>());
} }
// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList, // TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
// it creates of fixed size, and the add method raise UnsupportedOperationException at // it creates of fixed size, and the add method raise UnsupportedOperationException at
// java.util.AbstractList.add // java.util.AbstractList.add
final List<StructuredProperty> tmp = new ArrayList<>(r.getPid()); final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
tmp.add(a._1()); tmp.add(a._1());
r.setPid(tmp); r.setPid(tmp);
} }
} }
}); });
} }
public static String normalizeFullName(final String fullname) { public static String normalizeFullName(final String fullname) {
return nfd(fullname) return nfd(fullname)
.toLowerCase() .toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError // do not compact the regexes in a single expression, would cause StackOverflowError
// in case // in case
// of large input strings // of large input strings
.replaceAll("(\\W)+", " ") .replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ") .replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ") .replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ") .replaceAll("(\\n)+", " ")
.trim(); .trim();
} }
static int hammingDist(String str1, String str2) { static int hammingDist(String str1, String str2) {
if (str1.length() != str2.length()) if (str1.length() != str2.length())
return Math.max(str1.length(), str2.length()); return Math.max(str1.length(), str2.length());
int i = 0, count = 0; int i = 0, count = 0;
while (i < str1.length()) { while (i < str1.length()) {
if (str1.charAt(i) != str2.charAt(i)) if (str1.charAt(i) != str2.charAt(i))
count++; count++;
i++; i++;
} }
return count; return count;
} }
private static String authorFieldToBeCompared(Author author) { private static String authorFieldToBeCompared(Author author) {
if (StringUtils.isNotBlank(author.getSurname())) { if (StringUtils.isNotBlank(author.getSurname())) {
return author.getSurname(); return author.getSurname();
} }
if (StringUtils.isNotBlank(author.getFullname())) { if (StringUtils.isNotBlank(author.getFullname())) {
return author.getFullname(); return author.getFullname();
} }
return null; return null;
} }
/** /**
* This method tries to figure out when two author are the same in the contest * This method tries to figure out when two author are the same in the contest
* of ORCID enrichment * of ORCID enrichment
* @param left Author in the OAF entity *
* @param right Author ORCID * @param left Author in the OAF entity
* @return based on a heuristic on the names of the authors if they are the same. * @param right Author ORCID
*/ * @return based on a heuristic on the names of the authors if they are the same.
public static boolean checkORCIDSimilarity(final Author left, final Author right) { */
final Person pl = parse(left); public static boolean checkORCIDSimilarity(final Author left, final Author right) {
final Person pr = parse(right); final Person pl = parse(left);
final Person pr = parse(right);
// If one of them didn't have a surname we verify if they have the fullName not empty // If one of them didn't have a surname we verify if they have the fullName not empty
// and verify if the normalized version is equal // and verify if the normalized version is equal
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) && if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) { pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
&& !pr.getFullname().isEmpty()) { && !pr.getFullname().isEmpty()) {
return pl return pl
.getFullname() .getFullname()
.stream() .stream()
.anyMatch( .anyMatch(
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr)))); fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
} else { } else {
return false; return false;
} }
} }
// The Authors have one surname in common // The Authors have one surname in common
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) { if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
// If one of them has only a surname and is the same we can say that they are the same author // If one of them has only a surname and is the same we can say that they are the same author
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) || if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank))) (pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
return true; return true;
// The authors have the same initials of Name in common // The authors have the same initials of Name in common
if (pl if (pl
.getName() .getName()
.stream() .stream()
.anyMatch( .anyMatch(
nl -> pr nl -> pr
.getName() .getName()
.stream() .stream()
.anyMatch(nr -> nr.equalsIgnoreCase(nl)))) .anyMatch(nr -> nr.equalsIgnoreCase(nl))))
return true; return true;
} }
// Sometimes we noticed that publication have author wrote in inverse order Surname, Name // Sometimes we noticed that publication have author wrote in inverse order Surname, Name
// We verify if we have an exact match between name and surname // We verify if we have an exact match between name and surname
if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) && if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl)))) pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
return true; return true;
else else
return false; return false;
} }
// //
/** /**
* Method to enrich ORCID information in one list of authors based on another list * Method to enrich ORCID information in one list of authors based on another list
* @param baseAuthor the Author List in the OAF Entity *
* @param orcidAuthor The list of ORCID Author intersected * @param baseAuthor the Author List in the OAF Entity
* @return The Author List of the OAF Entity enriched with the orcid Author * @param orcidAuthor The list of ORCID Author intersected
*/ * @return The Author List of the OAF Entity enriched with the orcid Author
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) { */
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
if (baseAuthor == null || baseAuthor.isEmpty()) if (baseAuthor == null || baseAuthor.isEmpty())
return orcidAuthor; return orcidAuthor;
if (orcidAuthor == null || orcidAuthor.isEmpty()) if (orcidAuthor == null || orcidAuthor.isEmpty())
return baseAuthor; return baseAuthor;
if (baseAuthor.size() == 1 && orcidAuthor.size() > 10) if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
return baseAuthor; return baseAuthor;
final List<Author> oAuthor = new ArrayList<>(); final List<Author> oAuthor = new ArrayList<>();
oAuthor.addAll(orcidAuthor); oAuthor.addAll(orcidAuthor);
baseAuthor.forEach(ba -> { baseAuthor.forEach(ba -> {
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst(); Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
if (aMatch.isPresent()) { if (aMatch.isPresent()) {
final Author sameAuthor = aMatch.get(); final Author sameAuthor = aMatch.get();
addPid(ba, sameAuthor.getPid()); addPid(ba, sameAuthor.getPid());
oAuthor.remove(sameAuthor); oAuthor.remove(sameAuthor);
} }
}); });
return baseAuthor; return baseAuthor;
} }
private static void addPid(final Author a, final List<StructuredProperty> pids) { private static void addPid(final Author a, final List<StructuredProperty> pids) {
if (a.getPid() == null) { if (a.getPid() == null) {
a.setPid(new ArrayList<>()); a.setPid(new ArrayList<>());
} }
a.getPid().addAll(pids); a.getPid().addAll(pids);
} }
public static String pidToComparableString(StructuredProperty pid) { public static String pidToComparableString(StructuredProperty pid) {
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
: ""; : "";
return (pid.getQualifier() != null ? classid : "") return (pid.getQualifier() != null ? classid : "")
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
} }
public static int countAuthorsPids(List<Author> authors) { public static int countAuthorsPids(List<Author> authors) {
if (authors == null) if (authors == null)
return 0; return 0;
return (int) authors.stream().filter(AuthorMerger::hasPid).count(); return (int) authors.stream().filter(AuthorMerger::hasPid).count();
} }
private static int authorsSize(List<Author> authors) { private static int authorsSize(List<Author> authors) {
if (authors == null) if (authors == null)
return 0; return 0;
return authors.size(); return authors.size();
} }
private static Double sim(Author a, Author b) { private static Double sim(Author a, Author b) {
final Person pa = parse(a); final Person pa = parse(a);
final Person pb = parse(b); final Person pb = parse(b);
// if both are accurate (e.g. they have name and surname) // if both are accurate (e.g. they have name and surname)
if (pa.isAccurate() & pb.isAccurate()) { if (pa.isAccurate() & pb.isAccurate()) {
return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5 return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5; + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
} else { } else {
return new JaroWinkler() return new JaroWinkler()
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
} }
} }
private static boolean hasPid(Author a) { private static boolean hasPid(Author a) {
if (a == null || a.getPid() == null || a.getPid().isEmpty()) if (a == null || a.getPid() == null || a.getPid().isEmpty())
return false; return false;
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
} }
private static Person parse(Author author) { private static Person parse(Author author) {
if (StringUtils.isNotBlank(author.getSurname())) { if (StringUtils.isNotBlank(author.getSurname())) {
return new Person(author.getSurname() + ", " + author.getName(), false); return new Person(author.getSurname() + ", " + author.getName(), false);
} else { } else {
if (StringUtils.isNotBlank(author.getFullname())) if (StringUtils.isNotBlank(author.getFullname()))
return new Person(author.getFullname(), false); return new Person(author.getFullname(), false);
else else
return new Person("", false); return new Person("", false);
} }
} }
public static String normalize(final String s) { public static String normalize(final String s) {
String[] normalized = nfd(s) String[] normalized = nfd(s)
.toLowerCase() .toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError // do not compact the regexes in a single expression, would cause StackOverflowError
// in case // in case
// of large input strings // of large input strings
.replaceAll("(\\W)+", " ") .replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ") .replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ") .replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ") .replaceAll("(\\n)+", " ")
.trim() .trim()
.split(" "); .split(" ");
Arrays.sort(normalized); Arrays.sort(normalized);
return String.join(" ", normalized); return String.join(" ", normalized);
} }
private static String nfd(final String s) { private static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD); return Normalizer.normalize(s, Normalizer.Form.NFD);
} }
} }

View File

@ -79,8 +79,8 @@ public class PrepareAffiliationRelationsTest {
.getPath(); .getPath();
String pubmedAffiliationRelationsPath = getClass() String pubmedAffiliationRelationsPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json") .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath(); .getPath();
String outputPath = workingDir.toString() + "/actionSet"; String outputPath = workingDir.toString() + "/actionSet";

View File

@ -92,7 +92,6 @@ object SparkGenerateDoiBoost {
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(s"$workingDirPath/firstJoin") .save(s"$workingDirPath/firstJoin")
logger.info("Phase 2) Join Result with MAG") logger.info("Phase 2) Join Result with MAG")
val sj: Dataset[(String, Publication)] = val sj: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p)) spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))

View File

@ -73,7 +73,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
col("id").alias("dnet_id") col("id").alias("dnet_id")
) )
val orcidDnet = orcidPublication val orcidDnet = orcidPublication
.join( .join(
entities, entities,
lower(col("schema")).equalTo(lower(col("pid_schema"))) && lower(col("schema")).equalTo(lower(col("pid_schema"))) &&

View File

@ -6,13 +6,10 @@ import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions._
class EnrichOrcidTest { class EnrichOrcidTest {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
def test() = { def test() = {
val spark = SparkSession.builder().master("local[*]").getOrCreate() val spark = SparkSession.builder().master("local[*]").getOrCreate()
// spark.sparkContext.setLogLevel("ERROR") // spark.sparkContext.setLogLevel("ERROR")
@ -63,8 +60,7 @@ class EnrichOrcidTest {
// }).filter(author => author != null) // }).filter(author => author != null)
// }) // })
Encoders
Encoders
import spark.implicits._ import spark.implicits._
// val enriched = spark.read // val enriched = spark.read
@ -76,10 +72,6 @@ class EnrichOrcidTest {
// //
// .show() // .show()
} }
} }