Added check for empty author list. If crossref is empty, the longest from all the merging providers is taken. If crossref is not empty, crossref is chosen as base for the enrichment

2021-07-08 15:27:05 +02:00 · 2021-07-08 15:27:05 +02:00 · 97e0c27db9
parent 3ed90420e4
commit 97e0c27db9
1 changed files with 129 additions and 97 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
@ -5,101 +5,159 @@ import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
-import org.apache.commons.lang3.StringUtils;
+import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.utils.DHPUtils;
 import com.wcohen.ss.JaroWinkler;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import eu.dnetlib.pace.model.Person;
+
 import scala.Tuple2;
 public class DoiBoostAuthorMerger {
 	private static final Double THRESHOLD = 0.95;
-	public static List<Author> merge(List<List<Author>> authors) {
+	public static List<Author> merge(List<List<Author>> authors,  Boolean crossref) {
 		Iterator<List<Author>> it = authors.iterator();
-		final List<Author> author = it.next();
+		List<Author> author = it.next();
-		it.forEachRemaining(autList -> enrichPidFromList(author, autList, THRESHOLD));
+		while (it.hasNext()){
 			List<Author> autList = it.next();
 			Tuple2<List<Author>, Boolean> tmp = mergeAuthor(author, autList, crossref);
 			author = tmp._1();
 			crossref = tmp._2();
 		}
 		return author;
 	}
-	public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor,
+	public static Tuple2<List<Author>, Boolean> mergeAuthor(final List<Author> baseAuthor, final List<Author> otherAuthor,
-		Double threshold) {
+										    final Boolean crossref) {
 		if(baseAuthor == null || baseAuthor.size() == 0)
 			return new Tuple2<>(otherAuthor, false);
 		if(otherAuthor == null || otherAuthor.size() == 0)
 			return new Tuple2<>(baseAuthor, crossref);
 		if(crossref) {
 			enrichPidFromList(baseAuthor, otherAuthor);
 			return new Tuple2<>(baseAuthor, true);
 		}
 		else
 			if (baseAuthor.size() > otherAuthor.size()){
 				enrichPidFromList(baseAuthor, otherAuthor);
 				return new Tuple2<>(baseAuthor, false);
 			}else{
 				enrichPidFromList(otherAuthor, baseAuthor);
 				return new Tuple2<>(otherAuthor, false);
 			}
 		enrichPidFromList(crossrefAuthor, otherAuthor, threshold);
 		return crossrefAuthor;
 	}
 	public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor) {
 		return mergeAuthor(crossrefAuthor, otherAuthor, THRESHOLD);
 	}
-	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
+	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
-		if (base == null || enrich == null)
+		if(base == null || enrich == null)
-			return;
+			return ;
-		// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
+		//search authors having identifiers in the enrich list
-		final Map<String, Author> basePidAuthorMap = base
+        final List<Author> authorsWithPids = enrich
-			.stream()
+                .stream()
-			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
-			.flatMap(
+                .collect(Collectors.toList());
 				a -> a
 					.getPid()
 					.stream()
 					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
 			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
-		// <pid, Author> (list of pid that are missing in the other list)
+		Map<String, AuthorAssoc> assocMap = authorsWithPids
-		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+				.stream()
-			.stream()
+				.map(
-			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
+						a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a)))
-			.flatMap(
+				.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
 				a -> a
 					.getPid()
 					.stream()
 					.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
 					.map(p -> new Tuple2<>(p, a)))
 			.collect(Collectors.toList());
 		pidToEnrich
 			.forEach(
 				a -> {
 					Optional<Tuple2<Double, Author>> simAuthor = base
 						.stream()
 						.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
 						.max(Comparator.comparing(Tuple2::_1));
-					if (simAuthor.isPresent()) {
+		//for each author in the base list, we search the best enriched match
-						double th = threshold;
+		base.stream()
-						// increase the threshold if the surname is too short
+				.map(a -> new Tuple2<>(a, authorsWithPids.stream()
-						if (simAuthor.get()._2().getSurname() != null
+						.map(e -> new Tuple2<>(e, sim(a, e))).collect(Collectors.toList())))
-							&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
+                .forEach(t2 -> {
 							th = 0.99;
-						if (simAuthor.get()._1() > th) {
+                    for (Tuple2<Author, Double> t : t2._2()) {
-							Author r = simAuthor.get()._2();
+                    	String mapEntry = DHPUtils.md5(t._1().getFullname());
-							if (r.getPid() == null) {
+                    	AuthorAssoc aa = assocMap.get(mapEntry);
-								r.setPid(new ArrayList<>());
+                    	if(aa.getScore() < t._2()){
-							}
+							aa.setScore(t._2());
-
+							aa.setTo_be_enriched(new ArrayList<>());
-							// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
+							aa.getTo_be_enriched().add(t2._1());
-							// it creates of fixed size, and the add method raise UnsupportedOperationException at
+						}else if(aa.getScore() == t._2()){
-							// java.util.AbstractList.add
+                    		aa.getTo_be_enriched().add(t2._1());
 							final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
 							tmp.add(a._1());
 							r.setPid(tmp);
 						}
                    }
                });
 		assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k)));
 	}
 	private static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching){
 		return fullEnrich.stream().filter( w -> fullEnriching.contains(w)).count();
 	}
 	private static void enrichAuthor(Author enrich, Author enriching){
 		//verify if some of the words in the fullname are contained in the other
 		//get normalized fullname
 		long commonWords = getCommonWords(normalize(enrich.getFullname()),
 				normalize(enriching.getFullname()));
 		if(commonWords > 0 ){
 			if(enrich.getPid() == null){
 				enrich.setPid(new ArrayList<>());
 			}
 				Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet());
 			enriching.getPid().forEach(p -> {
 					if (!aPids.contains(pidToComparableString(p))){
 						enrich.getPid().add(p);
 					}
 				});
 			if (enrich.getAffiliation() == null){
 				if (enriching.getAffiliation() != null){
 					enrich.setAffiliation(enriching.getAffiliation());
 				}
 			}
 		}
 	}
 	//Verify the number of words in common. The one that has more, wins. If the number of words in common are the same we
 	//enrich no author
 	private static void enrichAuthor(AuthorAssoc authorAssoc) {
 		if (authorAssoc.getTo_be_enriched().size() == 1){
 			enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content());
 		}else{
 			long common = 0;
 			List<Author> selected = new ArrayList<>() ;
 			for(Author a : authorAssoc.getTo_be_enriched()){
 				long current_common = getCommonWords(normalize(a.getFullname()),
 						normalize(authorAssoc.getWith_enricheing_content().getFullname()));
 				if (current_common > common){
 					common = current_common;
 					selected = new ArrayList<>();
 					selected.add(a);
 				}else if(current_common == common){
 					selected.add(a);
 				}
 			}
 			if (selected.size() == 1){
 				enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content());
 			}
 		}
 	}
 	public static String pidToComparableString(StructuredProperty pid) {
 		return (pid.getQualifier() != null
 			? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
@ -107,49 +165,21 @@ public class DoiBoostAuthorMerger {
 			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
 	}
 	public static int countAuthorsPids(List<Author> authors) {
 		if (authors == null)
 			return 0;
 		return (int) authors.stream().filter(DoiBoostAuthorMerger::hasPid).count();
 	}
 	private static int authorsSize(List<Author> authors) {
 		if (authors == null)
 			return 0;
 		return authors.size();
 	}
 	private static Double sim(Author a, Author b) {
 		final Person pa = parse(a);
 		final Person pb = parse(b);
 		// if both are accurate (e.g. they have name and surname)
 		if (pa.isAccurate() & pb.isAccurate()) {
 			return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
 				+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
 		} else {
 			return new JaroWinkler()
-				.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
+				.score(normalizeString(a.getFullname()), normalizeString(b.getFullname()));
-		}
+
 	}
-	private static boolean hasPid(Author a) {
+	private static String normalizeString(String fullname) {
-		if (a == null || a.getPid() == null || a.getPid().size() == 0)
+		return String.join(" ", normalize(fullname));
 			return false;
 		return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
 	}
 	private static Person parse(Author author) {
 		if (StringUtils.isNotBlank(author.getSurname())) {
 			return new Person(author.getSurname() + ", " + author.getName(), false);
 		} else {
 			return new Person(author.getFullname(), false);
 		}
 	}
-	private static String normalize(final String s) {
+	private static List<String> normalize(final String s) {
 		String[] normalized = nfd(s)
 			.replaceAll("[^\\p{ASCII}]", "")
 			.toLowerCase()
@ -166,7 +196,9 @@ public class DoiBoostAuthorMerger {
 		Arrays.sort(normalized);
-		return String.join(" ", normalized);
+		return Arrays.asList(normalized);
 	}
 	private static String nfd(final String s) {