forked from D-Net/dnet-hadoop
code formatting
This commit is contained in:
parent
5e22b67b8a
commit
cdfb7588dd
|
@ -4,352 +4,329 @@ package eu.dnetlib.dhp.oa.merge;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jetbrains.annotations.NotNull;
|
|
||||||
|
|
||||||
import com.wcohen.ss.JaroWinkler;
|
import com.wcohen.ss.JaroWinkler;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
class SimilarityCellInfo implements Comparable<SimilarityCellInfo> {
|
|
||||||
|
|
||||||
public int authorPosition = 0;
|
|
||||||
public int orcidPosition = 0;
|
|
||||||
|
|
||||||
public double maxColumnSimilarity = 0.0;
|
|
||||||
|
|
||||||
public SimilarityCellInfo() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setValues(final int authPos, final int orcidPos, final double similarity) {
|
|
||||||
this.authorPosition = authPos;
|
|
||||||
this.orcidPosition = orcidPos;
|
|
||||||
this.maxColumnSimilarity = similarity;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compareTo(@NotNull SimilarityCellInfo o) {
|
|
||||||
return Double.compare(maxColumnSimilarity, o.maxColumnSimilarity);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public class AuthorMerger {
|
public class AuthorMerger {
|
||||||
|
|
||||||
private static final Double THRESHOLD = 0.95;
|
private static final Double THRESHOLD = 0.95;
|
||||||
|
|
||||||
// Private constructor: this class is used only through its static methods.
private AuthorMerger() {
}
|
||||||
|
|
||||||
public static List<Author> merge(List<List<Author>> authors) {
|
public static List<Author> merge(List<List<Author>> authors) {
|
||||||
|
|
||||||
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
|
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
|
||||||
|
|
||||||
List<Author> author = new ArrayList<>();
|
List<Author> author = new ArrayList<>();
|
||||||
|
|
||||||
for (List<Author> a : authors) {
|
for (List<Author> a : authors) {
|
||||||
author = mergeAuthor(author, a);
|
author = mergeAuthor(author, a);
|
||||||
}
|
}
|
||||||
|
|
||||||
return author;
|
return author;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
|
||||||
int pa = countAuthorsPids(a);
|
int pa = countAuthorsPids(a);
|
||||||
int pb = countAuthorsPids(b);
|
int pb = countAuthorsPids(b);
|
||||||
List<Author> base;
|
List<Author> base;
|
||||||
List<Author> enrich;
|
List<Author> enrich;
|
||||||
int sa = authorsSize(a);
|
int sa = authorsSize(a);
|
||||||
int sb = authorsSize(b);
|
int sb = authorsSize(b);
|
||||||
|
|
||||||
if (sa == sb) {
|
if (sa == sb) {
|
||||||
base = pa > pb ? a : b;
|
base = pa > pb ? a : b;
|
||||||
enrich = pa > pb ? b : a;
|
enrich = pa > pb ? b : a;
|
||||||
} else {
|
} else {
|
||||||
base = sa > sb ? a : b;
|
base = sa > sb ? a : b;
|
||||||
enrich = sa > sb ? b : a;
|
enrich = sa > sb ? b : a;
|
||||||
}
|
}
|
||||||
enrichPidFromList(base, enrich, threshold);
|
enrichPidFromList(base, enrich, threshold);
|
||||||
return base;
|
return base;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
||||||
return mergeAuthor(a, b, THRESHOLD);
|
return mergeAuthor(a, b, THRESHOLD);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
|
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
|
||||||
if (base == null || enrich == null)
|
if (base == null || enrich == null)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
|
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
|
||||||
final Map<String, Author> basePidAuthorMap = base
|
final Map<String, Author> basePidAuthorMap = base
|
||||||
.stream()
|
.stream()
|
||||||
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
||||||
.flatMap(
|
.flatMap(
|
||||||
a -> a
|
a -> a
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
||||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||||
|
|
||||||
// <pid, Author> (list of pid that are missing in the other list)
|
// <pid, Author> (list of pid that are missing in the other list)
|
||||||
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
||||||
.stream()
|
.stream()
|
||||||
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
||||||
.flatMap(
|
.flatMap(
|
||||||
a -> a
|
a -> a
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
|
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
|
||||||
.map(p -> new Tuple2<>(p, a)))
|
.map(p -> new Tuple2<>(p, a)))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
pidToEnrich
|
pidToEnrich
|
||||||
.forEach(
|
.forEach(
|
||||||
a -> {
|
a -> {
|
||||||
Optional<Tuple2<Double, Author>> simAuthor = base
|
Optional<Tuple2<Double, Author>> simAuthor = base
|
||||||
.stream()
|
.stream()
|
||||||
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
||||||
.max(Comparator.comparing(Tuple2::_1));
|
.max(Comparator.comparing(Tuple2::_1));
|
||||||
|
|
||||||
if (simAuthor.isPresent()) {
|
if (simAuthor.isPresent()) {
|
||||||
double th = threshold;
|
double th = threshold;
|
||||||
// increase the threshold if the surname is too short
|
// increase the threshold if the surname is too short
|
||||||
if (simAuthor.get()._2().getSurname() != null
|
if (simAuthor.get()._2().getSurname() != null
|
||||||
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
|
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
|
||||||
th = 0.99;
|
th = 0.99;
|
||||||
|
|
||||||
if (simAuthor.get()._1() > th) {
|
if (simAuthor.get()._1() > th) {
|
||||||
Author r = simAuthor.get()._2();
|
Author r = simAuthor.get()._2();
|
||||||
if (r.getPid() == null) {
|
if (r.getPid() == null) {
|
||||||
r.setPid(new ArrayList<>());
|
r.setPid(new ArrayList<>());
|
||||||
}
|
}
|
||||||
|
|
||||||
// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
|
// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
|
||||||
// it creates of fixed size, and the add method raise UnsupportedOperationException at
|
// it creates of fixed size, and the add method raise UnsupportedOperationException at
|
||||||
// java.util.AbstractList.add
|
// java.util.AbstractList.add
|
||||||
final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
|
final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
|
||||||
tmp.add(a._1());
|
tmp.add(a._1());
|
||||||
r.setPid(tmp);
|
r.setPid(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String normalizeFullName(final String fullname) {
|
public static String normalizeFullName(final String fullname) {
|
||||||
return nfd(fullname)
|
return nfd(fullname)
|
||||||
.toLowerCase()
|
.toLowerCase()
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||||
// in case
|
// in case
|
||||||
// of large input strings
|
// of large input strings
|
||||||
.replaceAll("(\\W)+", " ")
|
.replaceAll("(\\W)+", " ")
|
||||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
.replaceAll("(\\d)+", " ")
|
.replaceAll("(\\d)+", " ")
|
||||||
.replaceAll("(\\n)+", " ")
|
.replaceAll("(\\n)+", " ")
|
||||||
|
|
||||||
.trim();
|
.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
static int hammingDist(String str1, String str2) {
|
static int hammingDist(String str1, String str2) {
|
||||||
if (str1.length() != str2.length())
|
if (str1.length() != str2.length())
|
||||||
return Math.max(str1.length(), str2.length());
|
return Math.max(str1.length(), str2.length());
|
||||||
int i = 0, count = 0;
|
int i = 0, count = 0;
|
||||||
while (i < str1.length()) {
|
while (i < str1.length()) {
|
||||||
if (str1.charAt(i) != str2.charAt(i))
|
if (str1.charAt(i) != str2.charAt(i))
|
||||||
count++;
|
count++;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String authorFieldToBeCompared(Author author) {
|
private static String authorFieldToBeCompared(Author author) {
|
||||||
if (StringUtils.isNotBlank(author.getSurname())) {
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
||||||
return author.getSurname();
|
return author.getSurname();
|
||||||
|
|
||||||
}
|
}
|
||||||
if (StringUtils.isNotBlank(author.getFullname())) {
|
if (StringUtils.isNotBlank(author.getFullname())) {
|
||||||
return author.getFullname();
|
return author.getFullname();
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method tries to figure out when two author are the same in the contest
|
* This method tries to figure out when two author are the same in the contest
|
||||||
* of ORCID enrichment
|
* of ORCID enrichment
|
||||||
* @param left Author in the OAF entity
|
*
|
||||||
* @param right Author ORCID
|
* @param left Author in the OAF entity
|
||||||
* @return based on a heuristic on the names of the authors if they are the same.
|
* @param right Author ORCID
|
||||||
*/
|
* @return based on a heuristic on the names of the authors if they are the same.
|
||||||
public static boolean checkORCIDSimilarity(final Author left, final Author right) {
|
*/
|
||||||
final Person pl = parse(left);
|
public static boolean checkORCIDSimilarity(final Author left, final Author right) {
|
||||||
final Person pr = parse(right);
|
final Person pl = parse(left);
|
||||||
|
final Person pr = parse(right);
|
||||||
|
|
||||||
// If one of them didn't have a surname we verify if they have the fullName not empty
|
// If one of them didn't have a surname we verify if they have the fullName not empty
|
||||||
// and verify if the normalized version is equal
|
// and verify if the normalized version is equal
|
||||||
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
||||||
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
|
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
|
||||||
|
|
||||||
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
|
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
|
||||||
&& !pr.getFullname().isEmpty()) {
|
&& !pr.getFullname().isEmpty()) {
|
||||||
return pl
|
return pl
|
||||||
.getFullname()
|
.getFullname()
|
||||||
.stream()
|
.stream()
|
||||||
.anyMatch(
|
.anyMatch(
|
||||||
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
|
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// The Authors have one surname in common
|
// The Authors have one surname in common
|
||||||
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
||||||
|
|
||||||
// If one of them has only a surname and is the same we can say that they are the same author
|
// If one of them has only a surname and is the same we can say that they are the same author
|
||||||
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
|
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
|
||||||
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
|
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
|
||||||
return true;
|
return true;
|
||||||
// The authors have the same initials of Name in common
|
// The authors have the same initials of Name in common
|
||||||
if (pl
|
if (pl
|
||||||
.getName()
|
.getName()
|
||||||
.stream()
|
.stream()
|
||||||
.anyMatch(
|
.anyMatch(
|
||||||
nl -> pr
|
nl -> pr
|
||||||
.getName()
|
.getName()
|
||||||
.stream()
|
.stream()
|
||||||
.anyMatch(nr -> nr.equalsIgnoreCase(nl))))
|
.anyMatch(nr -> nr.equalsIgnoreCase(nl))))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sometimes we noticed that publication have author wrote in inverse order Surname, Name
|
// Sometimes we noticed that publication have author wrote in inverse order Surname, Name
|
||||||
// We verify if we have an exact match between name and surname
|
// We verify if we have an exact match between name and surname
|
||||||
if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
|
if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
|
||||||
pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
|
pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
|
||||||
return true;
|
return true;
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
//
|
//
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Method to enrich ORCID information in one list of authors based on another list
|
* Method to enrich ORCID information in one list of authors based on another list
|
||||||
* @param baseAuthor the Author List in the OAF Entity
|
*
|
||||||
* @param orcidAuthor The list of ORCID Author intersected
|
* @param baseAuthor the Author List in the OAF Entity
|
||||||
* @return The Author List of the OAF Entity enriched with the orcid Author
|
* @param orcidAuthor The list of ORCID Author intersected
|
||||||
*/
|
* @return The Author List of the OAF Entity enriched with the orcid Author
|
||||||
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
*/
|
||||||
|
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||||
|
|
||||||
if (baseAuthor == null || baseAuthor.isEmpty())
|
if (baseAuthor == null || baseAuthor.isEmpty())
|
||||||
return orcidAuthor;
|
return orcidAuthor;
|
||||||
|
|
||||||
if (orcidAuthor == null || orcidAuthor.isEmpty())
|
if (orcidAuthor == null || orcidAuthor.isEmpty())
|
||||||
return baseAuthor;
|
return baseAuthor;
|
||||||
|
|
||||||
if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
|
if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
|
||||||
return baseAuthor;
|
return baseAuthor;
|
||||||
|
|
||||||
final List<Author> oAuthor = new ArrayList<>();
|
final List<Author> oAuthor = new ArrayList<>();
|
||||||
oAuthor.addAll(orcidAuthor);
|
oAuthor.addAll(orcidAuthor);
|
||||||
|
|
||||||
baseAuthor.forEach(ba -> {
|
baseAuthor.forEach(ba -> {
|
||||||
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
|
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
|
||||||
if (aMatch.isPresent()) {
|
if (aMatch.isPresent()) {
|
||||||
final Author sameAuthor = aMatch.get();
|
final Author sameAuthor = aMatch.get();
|
||||||
addPid(ba, sameAuthor.getPid());
|
addPid(ba, sameAuthor.getPid());
|
||||||
oAuthor.remove(sameAuthor);
|
oAuthor.remove(sameAuthor);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return baseAuthor;
|
return baseAuthor;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void addPid(final Author a, final List<StructuredProperty> pids) {
|
private static void addPid(final Author a, final List<StructuredProperty> pids) {
|
||||||
|
|
||||||
if (a.getPid() == null) {
|
if (a.getPid() == null) {
|
||||||
a.setPid(new ArrayList<>());
|
a.setPid(new ArrayList<>());
|
||||||
}
|
}
|
||||||
|
|
||||||
a.getPid().addAll(pids);
|
a.getPid().addAll(pids);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String pidToComparableString(StructuredProperty pid) {
|
public static String pidToComparableString(StructuredProperty pid) {
|
||||||
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
|
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
|
||||||
: "";
|
: "";
|
||||||
return (pid.getQualifier() != null ? classid : "")
|
return (pid.getQualifier() != null ? classid : "")
|
||||||
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int countAuthorsPids(List<Author> authors) {
|
public static int countAuthorsPids(List<Author> authors) {
|
||||||
if (authors == null)
|
if (authors == null)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
return (int) authors.stream().filter(AuthorMerger::hasPid).count();
|
return (int) authors.stream().filter(AuthorMerger::hasPid).count();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int authorsSize(List<Author> authors) {
|
private static int authorsSize(List<Author> authors) {
|
||||||
if (authors == null)
|
if (authors == null)
|
||||||
return 0;
|
return 0;
|
||||||
return authors.size();
|
return authors.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Double sim(Author a, Author b) {
|
private static Double sim(Author a, Author b) {
|
||||||
|
|
||||||
final Person pa = parse(a);
|
final Person pa = parse(a);
|
||||||
final Person pb = parse(b);
|
final Person pb = parse(b);
|
||||||
|
|
||||||
// if both are accurate (e.g. they have name and surname)
|
// if both are accurate (e.g. they have name and surname)
|
||||||
if (pa.isAccurate() & pb.isAccurate()) {
|
if (pa.isAccurate() & pb.isAccurate()) {
|
||||||
return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
|
return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
|
||||||
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
|
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
|
||||||
} else {
|
} else {
|
||||||
return new JaroWinkler()
|
return new JaroWinkler()
|
||||||
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean hasPid(Author a) {
|
private static boolean hasPid(Author a) {
|
||||||
if (a == null || a.getPid() == null || a.getPid().isEmpty())
|
if (a == null || a.getPid() == null || a.getPid().isEmpty())
|
||||||
return false;
|
return false;
|
||||||
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Person parse(Author author) {
|
private static Person parse(Author author) {
|
||||||
if (StringUtils.isNotBlank(author.getSurname())) {
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
||||||
return new Person(author.getSurname() + ", " + author.getName(), false);
|
return new Person(author.getSurname() + ", " + author.getName(), false);
|
||||||
} else {
|
} else {
|
||||||
if (StringUtils.isNotBlank(author.getFullname()))
|
if (StringUtils.isNotBlank(author.getFullname()))
|
||||||
return new Person(author.getFullname(), false);
|
return new Person(author.getFullname(), false);
|
||||||
else
|
else
|
||||||
return new Person("", false);
|
return new Person("", false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String normalize(final String s) {
|
public static String normalize(final String s) {
|
||||||
String[] normalized = nfd(s)
|
String[] normalized = nfd(s)
|
||||||
.toLowerCase()
|
.toLowerCase()
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||||
// in case
|
// in case
|
||||||
// of large input strings
|
// of large input strings
|
||||||
.replaceAll("(\\W)+", " ")
|
.replaceAll("(\\W)+", " ")
|
||||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
.replaceAll("(\\d)+", " ")
|
.replaceAll("(\\d)+", " ")
|
||||||
.replaceAll("(\\n)+", " ")
|
.replaceAll("(\\n)+", " ")
|
||||||
.trim()
|
.trim()
|
||||||
.split(" ");
|
.split(" ");
|
||||||
|
|
||||||
Arrays.sort(normalized);
|
Arrays.sort(normalized);
|
||||||
|
|
||||||
return String.join(" ", normalized);
|
return String.join(" ", normalized);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String nfd(final String s) {
|
private static String nfd(final String s) {
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -79,8 +79,8 @@ public class PrepareAffiliationRelationsTest {
|
||||||
.getPath();
|
.getPath();
|
||||||
|
|
||||||
String pubmedAffiliationRelationsPath = getClass()
|
String pubmedAffiliationRelationsPath = getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
||||||
.getPath();
|
.getPath();
|
||||||
|
|
||||||
String outputPath = workingDir.toString() + "/actionSet";
|
String outputPath = workingDir.toString() + "/actionSet";
|
||||||
|
|
||||||
|
|
|
@ -92,7 +92,6 @@ object SparkGenerateDoiBoost {
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.save(s"$workingDirPath/firstJoin")
|
.save(s"$workingDirPath/firstJoin")
|
||||||
|
|
||||||
|
|
||||||
logger.info("Phase 2) Join Result with MAG")
|
logger.info("Phase 2) Join Result with MAG")
|
||||||
val sj: Dataset[(String, Publication)] =
|
val sj: Dataset[(String, Publication)] =
|
||||||
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
||||||
|
|
|
@ -73,7 +73,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
||||||
col("id").alias("dnet_id")
|
col("id").alias("dnet_id")
|
||||||
)
|
)
|
||||||
|
|
||||||
val orcidDnet = orcidPublication
|
val orcidDnet = orcidPublication
|
||||||
.join(
|
.join(
|
||||||
entities,
|
entities,
|
||||||
lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
|
lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
|
||||||
|
|
|
@ -6,13 +6,10 @@ import org.junit.jupiter.api.Test
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
import org.apache.spark.sql.functions._
|
import org.apache.spark.sql.functions._
|
||||||
|
|
||||||
|
|
||||||
class EnrichOrcidTest {
|
class EnrichOrcidTest {
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test() = {
|
def test() = {
|
||||||
val spark = SparkSession.builder().master("local[*]").getOrCreate()
|
val spark = SparkSession.builder().master("local[*]").getOrCreate()
|
||||||
// spark.sparkContext.setLogLevel("ERROR")
|
// spark.sparkContext.setLogLevel("ERROR")
|
||||||
|
@ -63,8 +60,7 @@ class EnrichOrcidTest {
|
||||||
// }).filter(author => author != null)
|
// }).filter(author => author != null)
|
||||||
// })
|
// })
|
||||||
|
|
||||||
|
Encoders
|
||||||
Encoders
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
// val enriched = spark.read
|
// val enriched = spark.read
|
||||||
|
@ -76,10 +72,6 @@ class EnrichOrcidTest {
|
||||||
//
|
//
|
||||||
// .show()
|
// .show()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue