forked from antonis.lempesis/dnet-hadoop
Added a check for an empty author list. If the Crossref author list is empty, the longest list from all the merging providers is taken as the base; if it is not empty, Crossref is chosen as the base for the enrichment.
This commit is contained in:
parent 3ed90420e4
commit 97e0c27db9
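For orientation, the base-selection rule described in the commit message can be sketched in isolation. This is a minimal standalone example, not code from this commit: the dnet-hadoop Author lists are simplified here to plain strings, and the class and method names (BaseSelectionSketch, selectBase) are made up for illustration.

// Minimal standalone sketch (not dnet-hadoop code) of the rule described above:
// if the Crossref author list is non-empty it becomes the base for enrichment,
// otherwise the longest list among the other providers is used.
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class BaseSelectionSketch {

    public static void main(String[] args) {
        List<String> crossref = Arrays.asList("Smith, J.", "Rossi, M.");
        List<List<String>> otherProviders = Arrays.asList(
            Arrays.asList("Smith, John"),
            Arrays.asList("Smith, John", "Rossi, Maria", "Doe, A."));

        // Crossref is non-empty here, so it is returned as the base list.
        System.out.println("Base list for enrichment: " + selectBase(crossref, otherProviders));
    }

    // Crossref wins when it has authors; otherwise take the longest provider list.
    static List<String> selectBase(List<String> crossref, List<List<String>> others) {
        if (crossref != null && !crossref.isEmpty()) {
            return crossref;
        }
        return others
            .stream()
            .max(Comparator.comparingInt(List::size))
            .orElse(Arrays.asList());
    }
}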
@@ -5,101 +5,159 @@ import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;

-import org.apache.commons.lang3.StringUtils;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.utils.DHPUtils;

 import com.wcohen.ss.JaroWinkler;

-import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import eu.dnetlib.pace.model.Person;
 import scala.Tuple2;

 public class DoiBoostAuthorMerger {

-    private static final Double THRESHOLD = 0.95;
-
-    public static List<Author> merge(List<List<Author>> authors) {
+    public static List<Author> merge(List<List<Author>> authors, Boolean crossref) {

         Iterator<List<Author>> it = authors.iterator();
-        final List<Author> author = it.next();
+        List<Author> author = it.next();

-        it.forEachRemaining(autList -> enrichPidFromList(author, autList, THRESHOLD));
+        while (it.hasNext()) {
+            List<Author> autList = it.next();
+            Tuple2<List<Author>, Boolean> tmp = mergeAuthor(author, autList, crossref);
+            author = tmp._1();
+            crossref = tmp._2();
+        }

         return author;

     }

-    public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor,
-        Double threshold) {
-
-        enrichPidFromList(crossrefAuthor, otherAuthor, threshold);
-        return crossrefAuthor;
-    }
-
-    public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor) {
-        return mergeAuthor(crossrefAuthor, otherAuthor, THRESHOLD);
-    }
+    public static Tuple2<List<Author>, Boolean> mergeAuthor(final List<Author> baseAuthor, final List<Author> otherAuthor,
+        final Boolean crossref) {
+
+        if (baseAuthor == null || baseAuthor.size() == 0)
+            return new Tuple2<>(otherAuthor, false);
+        if (otherAuthor == null || otherAuthor.size() == 0)
+            return new Tuple2<>(baseAuthor, crossref);
+
+        if (crossref) {
+            enrichPidFromList(baseAuthor, otherAuthor);
+            return new Tuple2<>(baseAuthor, true);
+        } else if (baseAuthor.size() > otherAuthor.size()) {
+            enrichPidFromList(baseAuthor, otherAuthor);
+            return new Tuple2<>(baseAuthor, false);
+        } else {
+            enrichPidFromList(otherAuthor, baseAuthor);
+            return new Tuple2<>(otherAuthor, false);
+        }
+    }

-    private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
+    private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
         if (base == null || enrich == null)
             return;

-        // <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
-        final Map<String, Author> basePidAuthorMap = base
-            .stream()
-            .filter(a -> a.getPid() != null && a.getPid().size() > 0)
-            .flatMap(
-                a -> a
-                    .getPid()
-                    .stream()
-                    .map(p -> new Tuple2<>(pidToComparableString(p), a)))
-            .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
-
-        // <pid, Author> (list of pid that are missing in the other list)
-        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
-            .stream()
-            .filter(a -> a.getPid() != null && a.getPid().size() > 0)
-            .flatMap(
-                a -> a
-                    .getPid()
-                    .stream()
-                    .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
-                    .map(p -> new Tuple2<>(p, a)))
-            .collect(Collectors.toList());
-
-        pidToEnrich
-            .forEach(
-                a -> {
-                    Optional<Tuple2<Double, Author>> simAuthor = base
-                        .stream()
-                        .map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
-                        .max(Comparator.comparing(Tuple2::_1));
-
-                    if (simAuthor.isPresent()) {
-                        double th = threshold;
-                        // increase the threshold if the surname is too short
-                        if (simAuthor.get()._2().getSurname() != null
-                            && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
-                            th = 0.99;
-
-                        if (simAuthor.get()._1() > th) {
-                            Author r = simAuthor.get()._2();
-                            if (r.getPid() == null) {
-                                r.setPid(new ArrayList<>());
-                            }
-
-                            // TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
-                            // it creates of fixed size, and the add method raise UnsupportedOperationException at
-                            // java.util.AbstractList.add
-                            final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
-                            tmp.add(a._1());
-                            r.setPid(tmp);
-                        }
-                    }
-                });
-    }
+        // search authors having identifiers in the enrich list
+        final List<Author> authorsWithPids = enrich
+            .stream()
+            .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+            .collect(Collectors.toList());
+
+        Map<String, AuthorAssoc> assocMap = authorsWithPids
+            .stream()
+            .map(
+                a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a)))
+            .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
+
+        // for each author in the base list, we search the best enriched match
+        base.stream()
+            .map(a -> new Tuple2<>(a, authorsWithPids.stream()
+                .map(e -> new Tuple2<>(e, sim(a, e))).collect(Collectors.toList())))
+            .forEach(t2 -> {
+                for (Tuple2<Author, Double> t : t2._2()) {
+                    String mapEntry = DHPUtils.md5(t._1().getFullname());
+                    AuthorAssoc aa = assocMap.get(mapEntry);
+                    if (aa.getScore() < t._2()) {
+                        aa.setScore(t._2());
+                        aa.setTo_be_enriched(new ArrayList<>());
+                        aa.getTo_be_enriched().add(t2._1());
+                    } else if (aa.getScore() == t._2()) {
+                        aa.getTo_be_enriched().add(t2._1());
+                    }
+                }
+            });
+
+        assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k)));
+    }
+
+    private static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching) {
+        return fullEnrich.stream().filter(w -> fullEnriching.contains(w)).count();
+    }
+
+    private static void enrichAuthor(Author enrich, Author enriching) {
+        // verify if some of the words in the fullname are contained in the other
+        // get normalized fullname
+        long commonWords = getCommonWords(normalize(enrich.getFullname()),
+            normalize(enriching.getFullname()));
+        if (commonWords > 0) {
+            if (enrich.getPid() == null) {
+                enrich.setPid(new ArrayList<>());
+            }
+            Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet());
+            enriching.getPid().forEach(p -> {
+                if (!aPids.contains(pidToComparableString(p))) {
+                    enrich.getPid().add(p);
+                }
+            });
+            if (enrich.getAffiliation() == null) {
+                if (enriching.getAffiliation() != null) {
+                    enrich.setAffiliation(enriching.getAffiliation());
+                }
+            }
+        }
+    }
+
+    // Verify the number of words in common. The one that has more, wins. If the number of words
+    // in common are the same we enrich no author
+    private static void enrichAuthor(AuthorAssoc authorAssoc) {
+        if (authorAssoc.getTo_be_enriched().size() == 1) {
+            enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content());
+        } else {
+            long common = 0;
+            List<Author> selected = new ArrayList<>();
+            for (Author a : authorAssoc.getTo_be_enriched()) {
+                long current_common = getCommonWords(normalize(a.getFullname()),
+                    normalize(authorAssoc.getWith_enricheing_content().getFullname()));
+                if (current_common > common) {
+                    common = current_common;
+                    selected = new ArrayList<>();
+                    selected.add(a);
+                } else if (current_common == common) {
+                    selected.add(a);
+                }
+            }
+            if (selected.size() == 1) {
+                enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content());
+            }
+        }
+    }

     public static String pidToComparableString(StructuredProperty pid) {
         return (pid.getQualifier() != null
             ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
@@ -107,49 +165,21 @@ public class DoiBoostAuthorMerger {
             + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
     }

-    public static int countAuthorsPids(List<Author> authors) {
-        if (authors == null)
-            return 0;
-
-        return (int) authors.stream().filter(DoiBoostAuthorMerger::hasPid).count();
-    }
-
-    private static int authorsSize(List<Author> authors) {
-        if (authors == null)
-            return 0;
-        return authors.size();
-    }
-
     private static Double sim(Author a, Author b) {

-        final Person pa = parse(a);
-        final Person pb = parse(b);
-
-        // if both are accurate (e.g. they have name and surname)
-        if (pa.isAccurate() & pb.isAccurate()) {
-            return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
-                + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
-        } else {
-            return new JaroWinkler()
-                .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
-        }
+        return new JaroWinkler()
+            .score(normalizeString(a.getFullname()), normalizeString(b.getFullname()));
     }

-    private static boolean hasPid(Author a) {
-        if (a == null || a.getPid() == null || a.getPid().size() == 0)
-            return false;
-        return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
+    private static String normalizeString(String fullname) {
+        return String.join(" ", normalize(fullname));
     }

-    private static Person parse(Author author) {
-        if (StringUtils.isNotBlank(author.getSurname())) {
-            return new Person(author.getSurname() + ", " + author.getName(), false);
-        } else {
-            return new Person(author.getFullname(), false);
-        }
-    }
-
-    private static String normalize(final String s) {
+    private static List<String> normalize(final String s) {
         String[] normalized = nfd(s)
             .replaceAll("[^\\p{ASCII}]", "")
             .toLowerCase()
@@ -166,7 +196,9 @@ public class DoiBoostAuthorMerger {

         Arrays.sort(normalized);

-        return String.join(" ", normalized);
+        return Arrays.asList(normalized);
+
+
     }

     private static String nfd(final String s) {
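The new enrichment path matches a base author against a PID-bearing author by counting the words shared between their normalised full names (getCommonWords over normalize in the diff above). The following is a minimal standalone sketch of that idea, not the committed code: the method bodies only approximate the real normalize (whose middle lines are outside the shown hunks), and the class name is made up for illustration.

// Minimal standalone sketch (not dnet-hadoop code) of name normalisation and
// common-word counting: full names are NFD-normalised, lower-cased, split into
// sorted word lists, and two names are considered compatible when they share words.
import java.text.Normalizer;
import java.util.Arrays;
import java.util.List;

public class CommonWordsSketch {

    public static void main(String[] args) {
        List<String> enrich = normalize("Rossi, Maria");
        List<String> enriching = normalize("Maria Rossi");
        // prints 2: both normalised names reduce to [maria, rossi]
        System.out.println("common words: " + getCommonWords(enrich, enriching));
    }

    // count how many normalised words of the first name appear in the second
    static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching) {
        return fullEnrich.stream().filter(fullEnriching::contains).count();
    }

    // illustrative normalisation: strip accents, lower-case, keep alphanumerics,
    // split on whitespace, and sort the resulting word list
    static List<String> normalize(String s) {
        String[] normalized = Normalizer
            .normalize(s, Normalizer.Form.NFD)
            .replaceAll("[^\\p{ASCII}]", "")
            .toLowerCase()
            .replaceAll("[^a-z0-9 ]", " ")
            .trim()
            .split("\\s+");
        Arrays.sort(normalized);
        return Arrays.asList(normalized);
    }
}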