|
|
|
@ -4,352 +4,329 @@ package eu.dnetlib.dhp.oa.merge;
|
|
|
|
|
import java.text.Normalizer;
|
|
|
|
|
import java.util.*;
|
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
|
import org.jetbrains.annotations.NotNull;
|
|
|
|
|
|
|
|
|
|
import com.wcohen.ss.JaroWinkler;
|
|
|
|
|
|
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
|
|
|
import eu.dnetlib.pace.model.Person;
|
|
|
|
|
import scala.Tuple2;
|
|
|
|
|
|
|
|
|
|
class SimilarityCellInfo implements Comparable<SimilarityCellInfo> {
|
|
|
|
|
|
|
|
|
|
public int authorPosition = 0;
|
|
|
|
|
public int orcidPosition = 0;
|
|
|
|
|
|
|
|
|
|
public double maxColumnSimilarity = 0.0;
|
|
|
|
|
|
|
|
|
|
public SimilarityCellInfo() {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setValues(final int authPos, final int orcidPos, final double similarity) {
|
|
|
|
|
this.authorPosition = authPos;
|
|
|
|
|
this.orcidPosition = orcidPos;
|
|
|
|
|
this.maxColumnSimilarity = similarity;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public int compareTo(@NotNull SimilarityCellInfo o) {
|
|
|
|
|
return Double.compare(maxColumnSimilarity, o.maxColumnSimilarity);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public class AuthorMerger {
|
|
|
|
|
|
|
|
|
|
private static final Double THRESHOLD = 0.95;
|
|
|
|
|
private static final Double THRESHOLD = 0.95;
|
|
|
|
|
|
|
|
|
|
private AuthorMerger() {
|
|
|
|
|
}
|
|
|
|
|
private AuthorMerger() {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static List<Author> merge(List<List<Author>> authors) {
|
|
|
|
|
public static List<Author> merge(List<List<Author>> authors) {
|
|
|
|
|
|
|
|
|
|
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
|
|
|
|
|
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
|
|
|
|
|
|
|
|
|
|
List<Author> author = new ArrayList<>();
|
|
|
|
|
List<Author> author = new ArrayList<>();
|
|
|
|
|
|
|
|
|
|
for (List<Author> a : authors) {
|
|
|
|
|
author = mergeAuthor(author, a);
|
|
|
|
|
}
|
|
|
|
|
for (List<Author> a : authors) {
|
|
|
|
|
author = mergeAuthor(author, a);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return author;
|
|
|
|
|
return author;
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
|
|
|
|
|
int pa = countAuthorsPids(a);
|
|
|
|
|
int pb = countAuthorsPids(b);
|
|
|
|
|
List<Author> base;
|
|
|
|
|
List<Author> enrich;
|
|
|
|
|
int sa = authorsSize(a);
|
|
|
|
|
int sb = authorsSize(b);
|
|
|
|
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
|
|
|
|
|
int pa = countAuthorsPids(a);
|
|
|
|
|
int pb = countAuthorsPids(b);
|
|
|
|
|
List<Author> base;
|
|
|
|
|
List<Author> enrich;
|
|
|
|
|
int sa = authorsSize(a);
|
|
|
|
|
int sb = authorsSize(b);
|
|
|
|
|
|
|
|
|
|
if (sa == sb) {
|
|
|
|
|
base = pa > pb ? a : b;
|
|
|
|
|
enrich = pa > pb ? b : a;
|
|
|
|
|
} else {
|
|
|
|
|
base = sa > sb ? a : b;
|
|
|
|
|
enrich = sa > sb ? b : a;
|
|
|
|
|
}
|
|
|
|
|
enrichPidFromList(base, enrich, threshold);
|
|
|
|
|
return base;
|
|
|
|
|
}
|
|
|
|
|
if (sa == sb) {
|
|
|
|
|
base = pa > pb ? a : b;
|
|
|
|
|
enrich = pa > pb ? b : a;
|
|
|
|
|
} else {
|
|
|
|
|
base = sa > sb ? a : b;
|
|
|
|
|
enrich = sa > sb ? b : a;
|
|
|
|
|
}
|
|
|
|
|
enrichPidFromList(base, enrich, threshold);
|
|
|
|
|
return base;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
|
|
|
|
return mergeAuthor(a, b, THRESHOLD);
|
|
|
|
|
}
|
|
|
|
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
|
|
|
|
return mergeAuthor(a, b, THRESHOLD);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
|
|
|
|
|
if (base == null || enrich == null)
|
|
|
|
|
return;
|
|
|
|
|
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
|
|
|
|
|
if (base == null || enrich == null)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
|
|
|
|
|
final Map<String, Author> basePidAuthorMap = base
|
|
|
|
|
.stream()
|
|
|
|
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
|
|
|
|
.flatMap(
|
|
|
|
|
a -> a
|
|
|
|
|
.getPid()
|
|
|
|
|
.stream()
|
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
|
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
|
|
|
|
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
|
|
|
|
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
|
|
|
|
|
final Map<String, Author> basePidAuthorMap = base
|
|
|
|
|
.stream()
|
|
|
|
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
|
|
|
|
.flatMap(
|
|
|
|
|
a -> a
|
|
|
|
|
.getPid()
|
|
|
|
|
.stream()
|
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
|
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
|
|
|
|
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
|
|
|
|
|
|
|
|
|
// <pid, Author> (list of pid that are missing in the other list)
|
|
|
|
|
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
|
|
|
|
.stream()
|
|
|
|
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
|
|
|
|
.flatMap(
|
|
|
|
|
a -> a
|
|
|
|
|
.getPid()
|
|
|
|
|
.stream()
|
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
|
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
|
|
|
|
|
.map(p -> new Tuple2<>(p, a)))
|
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
|
// <pid, Author> (list of pid that are missing in the other list)
|
|
|
|
|
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
|
|
|
|
.stream()
|
|
|
|
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
|
|
|
|
.flatMap(
|
|
|
|
|
a -> a
|
|
|
|
|
.getPid()
|
|
|
|
|
.stream()
|
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
|
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
|
|
|
|
|
.map(p -> new Tuple2<>(p, a)))
|
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
|
|
|
|
|
|
pidToEnrich
|
|
|
|
|
.forEach(
|
|
|
|
|
a -> {
|
|
|
|
|
Optional<Tuple2<Double, Author>> simAuthor = base
|
|
|
|
|
.stream()
|
|
|
|
|
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
|
|
|
|
.max(Comparator.comparing(Tuple2::_1));
|
|
|
|
|
pidToEnrich
|
|
|
|
|
.forEach(
|
|
|
|
|
a -> {
|
|
|
|
|
Optional<Tuple2<Double, Author>> simAuthor = base
|
|
|
|
|
.stream()
|
|
|
|
|
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
|
|
|
|
.max(Comparator.comparing(Tuple2::_1));
|
|
|
|
|
|
|
|
|
|
if (simAuthor.isPresent()) {
|
|
|
|
|
double th = threshold;
|
|
|
|
|
// increase the threshold if the surname is too short
|
|
|
|
|
if (simAuthor.get()._2().getSurname() != null
|
|
|
|
|
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
|
|
|
|
|
th = 0.99;
|
|
|
|
|
if (simAuthor.isPresent()) {
|
|
|
|
|
double th = threshold;
|
|
|
|
|
// increase the threshold if the surname is too short
|
|
|
|
|
if (simAuthor.get()._2().getSurname() != null
|
|
|
|
|
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
|
|
|
|
|
th = 0.99;
|
|
|
|
|
|
|
|
|
|
if (simAuthor.get()._1() > th) {
|
|
|
|
|
Author r = simAuthor.get()._2();
|
|
|
|
|
if (r.getPid() == null) {
|
|
|
|
|
r.setPid(new ArrayList<>());
|
|
|
|
|
}
|
|
|
|
|
if (simAuthor.get()._1() > th) {
|
|
|
|
|
Author r = simAuthor.get()._2();
|
|
|
|
|
if (r.getPid() == null) {
|
|
|
|
|
r.setPid(new ArrayList<>());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
|
|
|
|
|
// it creates of fixed size, and the add method raise UnsupportedOperationException at
|
|
|
|
|
// java.util.AbstractList.add
|
|
|
|
|
final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
|
|
|
|
|
tmp.add(a._1());
|
|
|
|
|
r.setPid(tmp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
|
|
|
|
|
// it creates of fixed size, and the add method raise UnsupportedOperationException at
|
|
|
|
|
// java.util.AbstractList.add
|
|
|
|
|
final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
|
|
|
|
|
tmp.add(a._1());
|
|
|
|
|
r.setPid(tmp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static String normalizeFullName(final String fullname) {
|
|
|
|
|
return nfd(fullname)
|
|
|
|
|
.toLowerCase()
|
|
|
|
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
|
|
|
|
// in case
|
|
|
|
|
// of large input strings
|
|
|
|
|
.replaceAll("(\\W)+", " ")
|
|
|
|
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
|
|
|
|
.replaceAll("(\\p{Punct})+", " ")
|
|
|
|
|
.replaceAll("(\\d)+", " ")
|
|
|
|
|
.replaceAll("(\\n)+", " ")
|
|
|
|
|
public static String normalizeFullName(final String fullname) {
|
|
|
|
|
return nfd(fullname)
|
|
|
|
|
.toLowerCase()
|
|
|
|
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
|
|
|
|
// in case
|
|
|
|
|
// of large input strings
|
|
|
|
|
.replaceAll("(\\W)+", " ")
|
|
|
|
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
|
|
|
|
.replaceAll("(\\p{Punct})+", " ")
|
|
|
|
|
.replaceAll("(\\d)+", " ")
|
|
|
|
|
.replaceAll("(\\n)+", " ")
|
|
|
|
|
|
|
|
|
|
.trim();
|
|
|
|
|
}
|
|
|
|
|
.trim();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int hammingDist(String str1, String str2) {
|
|
|
|
|
if (str1.length() != str2.length())
|
|
|
|
|
return Math.max(str1.length(), str2.length());
|
|
|
|
|
int i = 0, count = 0;
|
|
|
|
|
while (i < str1.length()) {
|
|
|
|
|
if (str1.charAt(i) != str2.charAt(i))
|
|
|
|
|
count++;
|
|
|
|
|
i++;
|
|
|
|
|
}
|
|
|
|
|
return count;
|
|
|
|
|
}
|
|
|
|
|
static int hammingDist(String str1, String str2) {
|
|
|
|
|
if (str1.length() != str2.length())
|
|
|
|
|
return Math.max(str1.length(), str2.length());
|
|
|
|
|
int i = 0, count = 0;
|
|
|
|
|
while (i < str1.length()) {
|
|
|
|
|
if (str1.charAt(i) != str2.charAt(i))
|
|
|
|
|
count++;
|
|
|
|
|
i++;
|
|
|
|
|
}
|
|
|
|
|
return count;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static String authorFieldToBeCompared(Author author) {
|
|
|
|
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
|
|
|
|
return author.getSurname();
|
|
|
|
|
private static String authorFieldToBeCompared(Author author) {
|
|
|
|
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
|
|
|
|
return author.getSurname();
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
if (StringUtils.isNotBlank(author.getFullname())) {
|
|
|
|
|
return author.getFullname();
|
|
|
|
|
}
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (StringUtils.isNotBlank(author.getFullname())) {
|
|
|
|
|
return author.getFullname();
|
|
|
|
|
}
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* This method tries to figure out when two author are the same in the contest
|
|
|
|
|
* of ORCID enrichment
|
|
|
|
|
* @param left Author in the OAF entity
|
|
|
|
|
* @param right Author ORCID
|
|
|
|
|
* @return based on a heuristic on the names of the authors if they are the same.
|
|
|
|
|
*/
|
|
|
|
|
public static boolean checkORCIDSimilarity(final Author left, final Author right) {
|
|
|
|
|
final Person pl = parse(left);
|
|
|
|
|
final Person pr = parse(right);
|
|
|
|
|
/**
|
|
|
|
|
* This method tries to figure out when two author are the same in the contest
|
|
|
|
|
* of ORCID enrichment
|
|
|
|
|
*
|
|
|
|
|
* @param left Author in the OAF entity
|
|
|
|
|
* @param right Author ORCID
|
|
|
|
|
* @return based on a heuristic on the names of the authors if they are the same.
|
|
|
|
|
*/
|
|
|
|
|
public static boolean checkORCIDSimilarity(final Author left, final Author right) {
|
|
|
|
|
final Person pl = parse(left);
|
|
|
|
|
final Person pr = parse(right);
|
|
|
|
|
|
|
|
|
|
// If one of them didn't have a surname we verify if they have the fullName not empty
|
|
|
|
|
// and verify if the normalized version is equal
|
|
|
|
|
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
|
|
|
|
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
|
|
|
|
|
// If one of them didn't have a surname we verify if they have the fullName not empty
|
|
|
|
|
// and verify if the normalized version is equal
|
|
|
|
|
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
|
|
|
|
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
|
|
|
|
|
|
|
|
|
|
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
|
|
|
|
|
&& !pr.getFullname().isEmpty()) {
|
|
|
|
|
return pl
|
|
|
|
|
.getFullname()
|
|
|
|
|
.stream()
|
|
|
|
|
.anyMatch(
|
|
|
|
|
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
|
|
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// The Authors have one surname in common
|
|
|
|
|
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
|
|
|
|
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
|
|
|
|
|
&& !pr.getFullname().isEmpty()) {
|
|
|
|
|
return pl
|
|
|
|
|
.getFullname()
|
|
|
|
|
.stream()
|
|
|
|
|
.anyMatch(
|
|
|
|
|
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
|
|
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// The Authors have one surname in common
|
|
|
|
|
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
|
|
|
|
|
|
|
|
|
// If one of them has only a surname and is the same we can say that they are the same author
|
|
|
|
|
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
|
|
|
|
|
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
|
|
|
|
|
return true;
|
|
|
|
|
// The authors have the same initials of Name in common
|
|
|
|
|
if (pl
|
|
|
|
|
.getName()
|
|
|
|
|
.stream()
|
|
|
|
|
.anyMatch(
|
|
|
|
|
nl -> pr
|
|
|
|
|
.getName()
|
|
|
|
|
.stream()
|
|
|
|
|
.anyMatch(nr -> nr.equalsIgnoreCase(nl))))
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
// If one of them has only a surname and is the same we can say that they are the same author
|
|
|
|
|
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
|
|
|
|
|
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
|
|
|
|
|
return true;
|
|
|
|
|
// The authors have the same initials of Name in common
|
|
|
|
|
if (pl
|
|
|
|
|
.getName()
|
|
|
|
|
.stream()
|
|
|
|
|
.anyMatch(
|
|
|
|
|
nl -> pr
|
|
|
|
|
.getName()
|
|
|
|
|
.stream()
|
|
|
|
|
.anyMatch(nr -> nr.equalsIgnoreCase(nl))))
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Sometimes we noticed that publication have author wrote in inverse order Surname, Name
|
|
|
|
|
// We verify if we have an exact match between name and surname
|
|
|
|
|
if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
|
|
|
|
|
pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
|
|
|
|
|
return true;
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
//
|
|
|
|
|
// Sometimes we noticed that publication have author wrote in inverse order Surname, Name
|
|
|
|
|
// We verify if we have an exact match between name and surname
|
|
|
|
|
if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
|
|
|
|
|
pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
|
|
|
|
|
return true;
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Method to enrich ORCID information in one list of authors based on another list
|
|
|
|
|
* @param baseAuthor the Author List in the OAF Entity
|
|
|
|
|
* @param orcidAuthor The list of ORCID Author intersected
|
|
|
|
|
* @return The Author List of the OAF Entity enriched with the orcid Author
|
|
|
|
|
*/
|
|
|
|
|
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
|
|
|
|
/**
|
|
|
|
|
* Method to enrich ORCID information in one list of authors based on another list
|
|
|
|
|
*
|
|
|
|
|
* @param baseAuthor the Author List in the OAF Entity
|
|
|
|
|
* @param orcidAuthor The list of ORCID Author intersected
|
|
|
|
|
* @return The Author List of the OAF Entity enriched with the orcid Author
|
|
|
|
|
*/
|
|
|
|
|
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
|
|
|
|
|
|
|
|
|
if (baseAuthor == null || baseAuthor.isEmpty())
|
|
|
|
|
return orcidAuthor;
|
|
|
|
|
if (baseAuthor == null || baseAuthor.isEmpty())
|
|
|
|
|
return orcidAuthor;
|
|
|
|
|
|
|
|
|
|
if (orcidAuthor == null || orcidAuthor.isEmpty())
|
|
|
|
|
return baseAuthor;
|
|
|
|
|
if (orcidAuthor == null || orcidAuthor.isEmpty())
|
|
|
|
|
return baseAuthor;
|
|
|
|
|
|
|
|
|
|
if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
|
|
|
|
|
return baseAuthor;
|
|
|
|
|
if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
|
|
|
|
|
return baseAuthor;
|
|
|
|
|
|
|
|
|
|
final List<Author> oAuthor = new ArrayList<>();
|
|
|
|
|
oAuthor.addAll(orcidAuthor);
|
|
|
|
|
final List<Author> oAuthor = new ArrayList<>();
|
|
|
|
|
oAuthor.addAll(orcidAuthor);
|
|
|
|
|
|
|
|
|
|
baseAuthor.forEach(ba -> {
|
|
|
|
|
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
|
|
|
|
|
if (aMatch.isPresent()) {
|
|
|
|
|
final Author sameAuthor = aMatch.get();
|
|
|
|
|
addPid(ba, sameAuthor.getPid());
|
|
|
|
|
oAuthor.remove(sameAuthor);
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
return baseAuthor;
|
|
|
|
|
}
|
|
|
|
|
baseAuthor.forEach(ba -> {
|
|
|
|
|
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
|
|
|
|
|
if (aMatch.isPresent()) {
|
|
|
|
|
final Author sameAuthor = aMatch.get();
|
|
|
|
|
addPid(ba, sameAuthor.getPid());
|
|
|
|
|
oAuthor.remove(sameAuthor);
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
return baseAuthor;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static void addPid(final Author a, final List<StructuredProperty> pids) {
|
|
|
|
|
private static void addPid(final Author a, final List<StructuredProperty> pids) {
|
|
|
|
|
|
|
|
|
|
if (a.getPid() == null) {
|
|
|
|
|
a.setPid(new ArrayList<>());
|
|
|
|
|
}
|
|
|
|
|
if (a.getPid() == null) {
|
|
|
|
|
a.setPid(new ArrayList<>());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
a.getPid().addAll(pids);
|
|
|
|
|
a.getPid().addAll(pids);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static String pidToComparableString(StructuredProperty pid) {
|
|
|
|
|
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
|
|
|
|
|
: "";
|
|
|
|
|
return (pid.getQualifier() != null ? classid : "")
|
|
|
|
|
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
|
|
|
|
}
|
|
|
|
|
public static String pidToComparableString(StructuredProperty pid) {
|
|
|
|
|
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
|
|
|
|
|
: "";
|
|
|
|
|
return (pid.getQualifier() != null ? classid : "")
|
|
|
|
|
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static int countAuthorsPids(List<Author> authors) {
|
|
|
|
|
if (authors == null)
|
|
|
|
|
return 0;
|
|
|
|
|
public static int countAuthorsPids(List<Author> authors) {
|
|
|
|
|
if (authors == null)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
return (int) authors.stream().filter(AuthorMerger::hasPid).count();
|
|
|
|
|
}
|
|
|
|
|
return (int) authors.stream().filter(AuthorMerger::hasPid).count();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static int authorsSize(List<Author> authors) {
|
|
|
|
|
if (authors == null)
|
|
|
|
|
return 0;
|
|
|
|
|
return authors.size();
|
|
|
|
|
}
|
|
|
|
|
private static int authorsSize(List<Author> authors) {
|
|
|
|
|
if (authors == null)
|
|
|
|
|
return 0;
|
|
|
|
|
return authors.size();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static Double sim(Author a, Author b) {
|
|
|
|
|
private static Double sim(Author a, Author b) {
|
|
|
|
|
|
|
|
|
|
final Person pa = parse(a);
|
|
|
|
|
final Person pb = parse(b);
|
|
|
|
|
final Person pa = parse(a);
|
|
|
|
|
final Person pb = parse(b);
|
|
|
|
|
|
|
|
|
|
// if both are accurate (e.g. they have name and surname)
|
|
|
|
|
if (pa.isAccurate() & pb.isAccurate()) {
|
|
|
|
|
return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
|
|
|
|
|
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
|
|
|
|
|
} else {
|
|
|
|
|
return new JaroWinkler()
|
|
|
|
|
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// if both are accurate (e.g. they have name and surname)
|
|
|
|
|
if (pa.isAccurate() & pb.isAccurate()) {
|
|
|
|
|
return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
|
|
|
|
|
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
|
|
|
|
|
} else {
|
|
|
|
|
return new JaroWinkler()
|
|
|
|
|
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static boolean hasPid(Author a) {
|
|
|
|
|
if (a == null || a.getPid() == null || a.getPid().isEmpty())
|
|
|
|
|
return false;
|
|
|
|
|
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
|
|
|
|
}
|
|
|
|
|
private static boolean hasPid(Author a) {
|
|
|
|
|
if (a == null || a.getPid() == null || a.getPid().isEmpty())
|
|
|
|
|
return false;
|
|
|
|
|
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static Person parse(Author author) {
|
|
|
|
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
|
|
|
|
return new Person(author.getSurname() + ", " + author.getName(), false);
|
|
|
|
|
} else {
|
|
|
|
|
if (StringUtils.isNotBlank(author.getFullname()))
|
|
|
|
|
return new Person(author.getFullname(), false);
|
|
|
|
|
else
|
|
|
|
|
return new Person("", false);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
private static Person parse(Author author) {
|
|
|
|
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
|
|
|
|
return new Person(author.getSurname() + ", " + author.getName(), false);
|
|
|
|
|
} else {
|
|
|
|
|
if (StringUtils.isNotBlank(author.getFullname()))
|
|
|
|
|
return new Person(author.getFullname(), false);
|
|
|
|
|
else
|
|
|
|
|
return new Person("", false);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static String normalize(final String s) {
|
|
|
|
|
String[] normalized = nfd(s)
|
|
|
|
|
.toLowerCase()
|
|
|
|
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
|
|
|
|
// in case
|
|
|
|
|
// of large input strings
|
|
|
|
|
.replaceAll("(\\W)+", " ")
|
|
|
|
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
|
|
|
|
.replaceAll("(\\p{Punct})+", " ")
|
|
|
|
|
.replaceAll("(\\d)+", " ")
|
|
|
|
|
.replaceAll("(\\n)+", " ")
|
|
|
|
|
.trim()
|
|
|
|
|
.split(" ");
|
|
|
|
|
public static String normalize(final String s) {
|
|
|
|
|
String[] normalized = nfd(s)
|
|
|
|
|
.toLowerCase()
|
|
|
|
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
|
|
|
|
// in case
|
|
|
|
|
// of large input strings
|
|
|
|
|
.replaceAll("(\\W)+", " ")
|
|
|
|
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
|
|
|
|
.replaceAll("(\\p{Punct})+", " ")
|
|
|
|
|
.replaceAll("(\\d)+", " ")
|
|
|
|
|
.replaceAll("(\\n)+", " ")
|
|
|
|
|
.trim()
|
|
|
|
|
.split(" ");
|
|
|
|
|
|
|
|
|
|
Arrays.sort(normalized);
|
|
|
|
|
Arrays.sort(normalized);
|
|
|
|
|
|
|
|
|
|
return String.join(" ", normalized);
|
|
|
|
|
}
|
|
|
|
|
return String.join(" ", normalized);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static String nfd(final String s) {
|
|
|
|
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
|
|
|
|
}
|
|
|
|
|
private static String nfd(final String s) {
|
|
|
|
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
Out of context this seems to account too much difference for strings of different length.
What about ensure to take as reference for while loop the shorter string, prefill count with the length difference and then add the char-by-char comparison difference?
That would be more permissive about strings that have the very same prefix.
About hammingDist function is never used, was a test of previous comparing function so I delete it