[DoiBoost Author merger] -

This commit is contained in:
Miriam Baglioni 2021-11-22 16:54:27 +01:00
parent 41ea1b2177
commit 910abcba04
2 changed files with 246 additions and 255 deletions

View File

@@ -1,18 +1,19 @@
package eu.dnetlib.doiboost;
import eu.dnetlib.dhp.schema.oaf.Author; package eu.dnetlib.doiboost;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import eu.dnetlib.dhp.schema.oaf.Author;
/** /**
* This class stores the association information between the enriching author and the possibly enriched ones. * This class stores the association information between the enriching author and the possibly enriched ones.
* It also contains the value of the similarity score between the enriching author and the possibly enriched ones. * It also contains the value of the similarity score between the enriching author and the possibly enriched ones.
* Possibly enriched authors with the same similarity score with the enriching are put in the to_be_enriched list. * Possibly enriched authors with the same similarity score with the enriching are put in the to_be_enriched list.
*/ */
public class AuthorAssoc implements Serializable { public class AuthorAssoc implements Serializable {
private Double score ; private Double score;
private List<Author> to_be_enriched; private List<Author> to_be_enriched;
private Author with_enricheing_content; private Author with_enricheing_content;
@@ -40,7 +41,7 @@ public class AuthorAssoc implements Serializable {
this.with_enricheing_content = with_enricheing_content; this.with_enricheing_content = with_enricheing_content;
} }
public static AuthorAssoc newInstance(Author a){ public static AuthorAssoc newInstance(Author a) {
AuthorAssoc ret = new AuthorAssoc(); AuthorAssoc ret = new AuthorAssoc();
ret.score = 0.0; ret.score = 0.0;
ret.to_be_enriched = new ArrayList<>(); ret.to_be_enriched = new ArrayList<>();

View File

@@ -6,14 +6,12 @@ import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.wcohen.ss.Jaccard; import com.wcohen.ss.Jaccard;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.DHPUtils;
import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2; import scala.Tuple2;
/** /**
@@ -49,13 +47,12 @@ import scala.Tuple2;
public class DoiBoostAuthorMerger { public class DoiBoostAuthorMerger {
public static List<Author> merge(List<List<Author>> authors, Boolean crossref) { public static List<Author> merge(List<List<Author>> authors, Boolean crossref) {
Iterator<List<Author>> it = authors.iterator(); Iterator<List<Author>> it = authors.iterator();
List<Author> author = it.next(); List<Author> author = it.next();
while (it.hasNext()){ while (it.hasNext()) {
List<Author> autList = it.next(); List<Author> autList = it.next();
Tuple2<List<Author>, Boolean> tmp = mergeAuthor(author, autList, crossref); Tuple2<List<Author>, Boolean> tmp = mergeAuthor(author, autList, crossref);
author = tmp._1(); author = tmp._1();
@@ -66,37 +63,35 @@ public class DoiBoostAuthorMerger {
} }
//If we have a list of authors coming from crossref we take that and we enrich it // If we have a list of authors coming from crossref we take that and we enrich it
//If we do not have a list of authors coming from crossref we enrich the longest at each step // If we do not have a list of authors coming from crossref we enrich the longest at each step
public static Tuple2<List<Author>, Boolean> mergeAuthor(final List<Author> baseAuthor, final List<Author> otherAuthor, public static Tuple2<List<Author>, Boolean> mergeAuthor(final List<Author> baseAuthor,
final List<Author> otherAuthor,
final Boolean crossref) { final Boolean crossref) {
if(baseAuthor == null || baseAuthor.size() == 0) if (baseAuthor == null || baseAuthor.size() == 0)
return new Tuple2<>(otherAuthor, false); return new Tuple2<>(otherAuthor, false);
if(otherAuthor == null || otherAuthor.size() == 0) if (otherAuthor == null || otherAuthor.size() == 0)
return new Tuple2<>(baseAuthor, crossref); return new Tuple2<>(baseAuthor, crossref);
if(crossref) { if (crossref) {
enrichPidFromList(baseAuthor, otherAuthor); enrichPidFromList(baseAuthor, otherAuthor);
return new Tuple2<>(baseAuthor, true); return new Tuple2<>(baseAuthor, true);
} } else if (baseAuthor.size() > otherAuthor.size()) {
else
if (baseAuthor.size() > otherAuthor.size()){
enrichPidFromList(baseAuthor, otherAuthor); enrichPidFromList(baseAuthor, otherAuthor);
return new Tuple2<>(baseAuthor, false); return new Tuple2<>(baseAuthor, false);
}else{ } else {
enrichPidFromList(otherAuthor, baseAuthor); enrichPidFromList(otherAuthor, baseAuthor);
return new Tuple2<>(otherAuthor, false); return new Tuple2<>(otherAuthor, false);
} }
} }
// valutare se questa cosa va invertita: dovrei prendere per ogni enriching author quello che piu' gli somiglia
//valutare se questa cosa va invertita: dovrei prendere per ogni enriching author quello che piu' gli somiglia // nella base list non il contrario
//nella base list non il contrario
private static void enrichPidFromList(List<Author> base, List<Author> enrich) { private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
//search authors having identifiers in the enrich list // search authors having identifiers in the enrich list
final List<Author> authorsWithPids = enrich final List<Author> authorsWithPids = enrich
.stream() .stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0) .filter(a -> a.getPid() != null && a.getPid().size() > 0)
@@ -108,46 +103,46 @@ public class DoiBoostAuthorMerger {
a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a))) a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
Map<String, Tuple2<String,Tuple2<List<String>, Double>>> baseAssoc = new HashMap<>(); Map<String, Tuple2<String, Tuple2<List<String>, Double>>> baseAssoc = new HashMap<>();
// for each author in the base list, we search the best enriching match
//for each author in the base list, we search the best enriching match // we create the association (author, list of (enriching author, similatiry score))
//we create the association (author, list of (enriching author, similatiry score)) base
base.stream() .stream()
.map(a -> .map(
new Tuple2<>(a, a -> new Tuple2<>(a,
authorsWithPids.stream() authorsWithPids
.stream()
.map(e -> new Tuple2<>(e, sim(a, e))) .map(e -> new Tuple2<>(e, sim(a, e)))
.filter(t2 -> t2._2() > 0.0) .filter(t2 -> t2._2() > 0.0)
.collect(Collectors.toList())) .collect(Collectors.toList())))
)
.forEach(t2 -> { .forEach(t2 -> {
String base_name = t2._1().getFullname(); String base_name = t2._1().getFullname();
String base_name_md5 = DHPUtils.md5(t2._1().getFullname()); String base_name_md5 = DHPUtils.md5(t2._1().getFullname());
Double max_score = 0.0; Double max_score = 0.0;
List<String> enrich_name = new ArrayList(); List<String> enrich_name = new ArrayList();
for (Tuple2<Author, Double> t : t2._2()) { for (Tuple2<Author, Double> t : t2._2()) {
//we get the fullname of the enriching // we get the fullname of the enriching
String mapEntry = DHPUtils.md5(t._1().getFullname()); String mapEntry = DHPUtils.md5(t._1().getFullname());
if(t._2() > max_score){ if (t._2() > max_score) {
max_score = t._2(); max_score = t._2();
enrich_name = new ArrayList(); enrich_name = new ArrayList();
enrich_name.add(mapEntry); enrich_name.add(mapEntry);
}else if(t._2() > 0 && t._2().equals(max_score)){ } else if (t._2() > 0 && t._2().equals(max_score)) {
enrich_name.add(mapEntry); enrich_name.add(mapEntry);
} }
AuthorAssoc aa = assocMap.get(mapEntry); AuthorAssoc aa = assocMap.get(mapEntry);
if(aa.getScore() < t._2()){ if (aa.getScore() < t._2()) {
aa.setScore(t._2()); aa.setScore(t._2());
aa.setTo_be_enriched(new ArrayList<>()); aa.setTo_be_enriched(new ArrayList<>());
aa.getTo_be_enriched().add(t2._1()); aa.getTo_be_enriched().add(t2._1());
}else { } else {
aa.getTo_be_enriched().add(t2._1()); aa.getTo_be_enriched().add(t2._1());
} }
} }
if(max_score > 0){ if (max_score > 0) {
baseAssoc.put(base_name_md5, new Tuple2(base_name, new Tuple2<>(enrich_name, max_score))); baseAssoc.put(base_name_md5, new Tuple2(base_name, new Tuple2<>(enrich_name, max_score)));
} }
@@ -158,105 +153,105 @@ public class DoiBoostAuthorMerger {
}) })
.collect(Collectors.toList()); .collect(Collectors.toList());
list.sort(Comparator.comparing(e -> e._1())); list.sort(Comparator.comparing(e -> e._1()));
//ordino per max score la baseAssoc // ordino per max score la baseAssoc
for (int i = list.size() -1 ; i>=0 ; i-- ){ for (int i = list.size() - 1; i >= 0; i--) {
Tuple2<Double, Tuple2<String, List<String>>> tmp = list.get(i); Tuple2<Double, Tuple2<String, List<String>>> tmp = list.get(i);
List<String> entries = tmp._2()._2(); List<String> entries = tmp._2()._2();
//se len = 1 => ho un solo e che con questo a ha max score // se len = 1 => ho un solo e che con questo a ha max score
if(entries.size() == 1){ if (entries.size() == 1) {
if(assocMap.containsKey(entries.get(0))) { if (assocMap.containsKey(entries.get(0))) {
enrichAuthor(assocMap.get(entries.get(0))); enrichAuthor(assocMap.get(entries.get(0)));
assocMap.remove(entries.get(0)); assocMap.remove(entries.get(0));
} }
}else{ } else {
String author_fullname = tmp._2()._1(); String author_fullname = tmp._2()._1();
long commonWords = 0; long commonWords = 0;
String enriching = null; String enriching = null;
for(String entry : entries){ for (String entry : entries) {
if (assocMap.containsKey(entry)){ if (assocMap.containsKey(entry)) {
long words = getCommonWords(normalize(entry), long words = getCommonWords(
normalize(entry),
normalize(author_fullname)); normalize(author_fullname));
if (words > commonWords){ if (words > commonWords) {
commonWords = words; commonWords = words;
enriching = entry; enriching = entry;
} }
if(words == commonWords){ if (words == commonWords) {
enriching = null; enriching = null;
} }
} }
} }
if(enriching != null){ if (enriching != null) {
enrichAuthor(assocMap.get(entries.get(0))); enrichAuthor(assocMap.get(entries.get(0)));
assocMap.remove(entries.get(0)); assocMap.remove(entries.get(0));
} }
//TODO pensare ad un modo per arricchire con il miglior e questo autore // TODO pensare ad un modo per arricchire con il miglior e questo autore
//Siamo nel caso in cui un autore ha piu' di un e con lo stesso similarity score // Siamo nel caso in cui un autore ha piu' di un e con lo stesso similarity score
} }
} }
// assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k))); // assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k)));
} }
private static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching){ private static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching) {
return fullEnrich.stream().filter( w -> fullEnriching.contains(w)).count(); return fullEnrich.stream().filter(w -> fullEnriching.contains(w)).count();
} }
private static void enrichAuthor(Author enrich, Author enriching) {
// verify if some of the words in the fullname are contained in the other
// get normalized fullname
private static void enrichAuthor(Author enrich, Author enriching){ long commonWords = getCommonWords(
//verify if some of the words in the fullname are contained in the other normalize(enrich.getFullname()),
//get normalized fullname
long commonWords = getCommonWords(normalize(enrich.getFullname()),
normalize(enriching.getFullname())); normalize(enriching.getFullname()));
if(commonWords > 0 ){ if (commonWords > 0) {
if(enrich.getPid() == null){ if (enrich.getPid() == null) {
enrich.setPid(new ArrayList<>()); enrich.setPid(new ArrayList<>());
} }
Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet()); Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet());
enriching.getPid().forEach(p -> { enriching.getPid().forEach(p -> {
if (!aPids.contains(pidToComparableString(p))){ if (!aPids.contains(pidToComparableString(p))) {
enrich.getPid().add(p); enrich.getPid().add(p);
} }
}); });
if (enrich.getAffiliation() == null){ if (enrich.getAffiliation() == null) {
if (enriching.getAffiliation() != null){ if (enriching.getAffiliation() != null) {
enrich.setAffiliation(enriching.getAffiliation()); enrich.setAffiliation(enriching.getAffiliation());
} }
} }
} }
} }
//Verify the number of words in common. The one that has more, wins. If the number of words in common are the same we // Verify the number of words in common. The one that has more, wins. If the number of words in common are the same
//enrich no author // we
// enrich no author
private static void enrichAuthor(AuthorAssoc authorAssoc) { private static void enrichAuthor(AuthorAssoc authorAssoc) {
if (authorAssoc.getTo_be_enriched().size() == 1){ if (authorAssoc.getTo_be_enriched().size() == 1) {
enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content()); enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content());
}else{ } else {
long common = 0; long common = 0;
List<Author> selected = new ArrayList<>() ; List<Author> selected = new ArrayList<>();
for(Author a : authorAssoc.getTo_be_enriched()){ for (Author a : authorAssoc.getTo_be_enriched()) {
long current_common = getCommonWords(normalize(a.getFullname()), long current_common = getCommonWords(
normalize(a.getFullname()),
normalize(authorAssoc.getWith_enricheing_content().getFullname())); normalize(authorAssoc.getWith_enricheing_content().getFullname()));
if (current_common > common){ if (current_common > common) {
common = current_common; common = current_common;
selected = new ArrayList<>(); selected = new ArrayList<>();
selected.add(a); selected.add(a);
}else if(current_common == common){ } else if (current_common == common) {
selected.add(a); selected.add(a);
} }
} }
if (selected.size() == 1){ if (selected.size() == 1) {
enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content()); enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content());
} }
} }
} }
public static String pidToComparableString(StructuredProperty pid) { public static String pidToComparableString(StructuredProperty pid) {
return (pid.getQualifier() != null return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
@@ -264,9 +259,6 @@ public class DoiBoostAuthorMerger {
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
} }
private static Double sim(Author a, Author b) { private static Double sim(Author a, Author b) {
return new Jaccard() return new Jaccard()
.score(normalizeString(a.getFullname()), normalizeString(b.getFullname())); .score(normalizeString(a.getFullname()), normalizeString(b.getFullname()));
@@ -277,7 +269,6 @@ public class DoiBoostAuthorMerger {
return String.join(" ", normalize(fullname)); return String.join(" ", normalize(fullname));
} }
private static List<String> normalize(final String s) { private static List<String> normalize(final String s) {
String[] normalized = nfd(s) String[] normalized = nfd(s)
.replaceAll("[^\\p{ASCII}]", "") .replaceAll("[^\\p{ASCII}]", "")
@@ -297,7 +288,6 @@ public class DoiBoostAuthorMerger {
return Arrays.asList(normalized); return Arrays.asList(normalized);
} }
private static String nfd(final String s) { private static String nfd(final String s) {