[DoiBoost Author merger] -

This commit is contained in:
Miriam Baglioni 2021-11-22 16:54:27 +01:00
parent 41ea1b2177
commit 910abcba04
2 changed files with 246 additions and 255 deletions

View File

@@ -1,18 +1,19 @@
package eu.dnetlib.doiboost;
import eu.dnetlib.dhp.schema.oaf.Author; package eu.dnetlib.doiboost;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import eu.dnetlib.dhp.schema.oaf.Author;
/** /**
* This class stores the association information between the enriching author and the possibly enriched ones. * This class stores the association information between the enriching author and the possibly enriched ones.
* It also contains the value of the similarity score between the enriching author and the possibly enriched ones. * It also contains the value of the similarity score between the enriching author and the possibly enriched ones.
* Possibly enriched authors with the same similarity score with the enriching are put in the to_be_enriched list. * Possibly enriched authors with the same similarity score with the enriching are put in the to_be_enriched list.
*/ */
public class AuthorAssoc implements Serializable { public class AuthorAssoc implements Serializable {
private Double score ; private Double score;
private List<Author> to_be_enriched; private List<Author> to_be_enriched;
private Author with_enricheing_content; private Author with_enricheing_content;
@@ -40,7 +41,7 @@ public class AuthorAssoc implements Serializable {
this.with_enricheing_content = with_enricheing_content; this.with_enricheing_content = with_enricheing_content;
} }
public static AuthorAssoc newInstance(Author a){ public static AuthorAssoc newInstance(Author a) {
AuthorAssoc ret = new AuthorAssoc(); AuthorAssoc ret = new AuthorAssoc();
ret.score = 0.0; ret.score = 0.0;
ret.to_be_enriched = new ArrayList<>(); ret.to_be_enriched = new ArrayList<>();

View File

@@ -6,14 +6,12 @@ import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.wcohen.ss.Jaccard; import com.wcohen.ss.Jaccard;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.DHPUtils;
import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2; import scala.Tuple2;
/** /**
@@ -49,13 +47,12 @@ import scala.Tuple2;
public class DoiBoostAuthorMerger { public class DoiBoostAuthorMerger {
public static List<Author> merge(List<List<Author>> authors, Boolean crossref) { public static List<Author> merge(List<List<Author>> authors, Boolean crossref) {
Iterator<List<Author>> it = authors.iterator(); Iterator<List<Author>> it = authors.iterator();
List<Author> author = it.next(); List<Author> author = it.next();
while (it.hasNext()){ while (it.hasNext()) {
List<Author> autList = it.next(); List<Author> autList = it.next();
Tuple2<List<Author>, Boolean> tmp = mergeAuthor(author, autList, crossref); Tuple2<List<Author>, Boolean> tmp = mergeAuthor(author, autList, crossref);
author = tmp._1(); author = tmp._1();
@@ -66,37 +63,35 @@ public class DoiBoostAuthorMerger {
} }
//If we have a list of authors coming from crossref we take that and we enrich it // If we have a list of authors coming from crossref we take that and we enrich it
//If we do not have a list of authors coming from crossref we enrich the longest at each step // If we do not have a list of authors coming from crossref we enrich the longest at each step
public static Tuple2<List<Author>, Boolean> mergeAuthor(final List<Author> baseAuthor, final List<Author> otherAuthor, public static Tuple2<List<Author>, Boolean> mergeAuthor(final List<Author> baseAuthor,
final List<Author> otherAuthor,
final Boolean crossref) { final Boolean crossref) {
if(baseAuthor == null || baseAuthor.size() == 0) if (baseAuthor == null || baseAuthor.size() == 0)
return new Tuple2<>(otherAuthor, false); return new Tuple2<>(otherAuthor, false);
if(otherAuthor == null || otherAuthor.size() == 0) if (otherAuthor == null || otherAuthor.size() == 0)
return new Tuple2<>(baseAuthor, crossref); return new Tuple2<>(baseAuthor, crossref);
if(crossref) { if (crossref) {
enrichPidFromList(baseAuthor, otherAuthor); enrichPidFromList(baseAuthor, otherAuthor);
return new Tuple2<>(baseAuthor, true); return new Tuple2<>(baseAuthor, true);
} } else if (baseAuthor.size() > otherAuthor.size()) {
else
if (baseAuthor.size() > otherAuthor.size()){
enrichPidFromList(baseAuthor, otherAuthor); enrichPidFromList(baseAuthor, otherAuthor);
return new Tuple2<>(baseAuthor, false); return new Tuple2<>(baseAuthor, false);
}else{ } else {
enrichPidFromList(otherAuthor, baseAuthor); enrichPidFromList(otherAuthor, baseAuthor);
return new Tuple2<>(otherAuthor, false); return new Tuple2<>(otherAuthor, false);
} }
} }
// valutare se questa cosa va invertita: dovrei prendere per ogni enriching author quello che piu' gli somiglia
//valutare se questa cosa va invertita: dovrei prendere per ogni enriching author quello che piu' gli somiglia // nella base list non il contrario
//nella base list non il contrario
private static void enrichPidFromList(List<Author> base, List<Author> enrich) { private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
//search authors having identifiers in the enrich list // search authors having identifiers in the enrich list
final List<Author> authorsWithPids = enrich final List<Author> authorsWithPids = enrich
.stream() .stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0) .filter(a -> a.getPid() != null && a.getPid().size() > 0)
@@ -108,46 +103,46 @@ public class DoiBoostAuthorMerger {
a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a))) a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
Map<String, Tuple2<String,Tuple2<List<String>, Double>>> baseAssoc = new HashMap<>(); Map<String, Tuple2<String, Tuple2<List<String>, Double>>> baseAssoc = new HashMap<>();
// for each author in the base list, we search the best enriching match
//for each author in the base list, we search the best enriching match // we create the association (author, list of (enriching author, similatiry score))
//we create the association (author, list of (enriching author, similatiry score)) base
base.stream() .stream()
.map(a -> .map(
new Tuple2<>(a, a -> new Tuple2<>(a,
authorsWithPids.stream() authorsWithPids
.stream()
.map(e -> new Tuple2<>(e, sim(a, e))) .map(e -> new Tuple2<>(e, sim(a, e)))
.filter(t2 -> t2._2() > 0.0) .filter(t2 -> t2._2() > 0.0)
.collect(Collectors.toList())) .collect(Collectors.toList())))
)
.forEach(t2 -> { .forEach(t2 -> {
String base_name = t2._1().getFullname(); String base_name = t2._1().getFullname();
String base_name_md5 = DHPUtils.md5(t2._1().getFullname()); String base_name_md5 = DHPUtils.md5(t2._1().getFullname());
Double max_score = 0.0; Double max_score = 0.0;
List<String> enrich_name = new ArrayList(); List<String> enrich_name = new ArrayList();
for (Tuple2<Author, Double> t : t2._2()) { for (Tuple2<Author, Double> t : t2._2()) {
//we get the fullname of the enriching // we get the fullname of the enriching
String mapEntry = DHPUtils.md5(t._1().getFullname()); String mapEntry = DHPUtils.md5(t._1().getFullname());
if(t._2() > max_score){ if (t._2() > max_score) {
max_score = t._2(); max_score = t._2();
enrich_name = new ArrayList(); enrich_name = new ArrayList();
enrich_name.add(mapEntry); enrich_name.add(mapEntry);
}else if(t._2() > 0 && t._2().equals(max_score)){ } else if (t._2() > 0 && t._2().equals(max_score)) {
enrich_name.add(mapEntry); enrich_name.add(mapEntry);
} }
AuthorAssoc aa = assocMap.get(mapEntry); AuthorAssoc aa = assocMap.get(mapEntry);
if(aa.getScore() < t._2()){ if (aa.getScore() < t._2()) {
aa.setScore(t._2()); aa.setScore(t._2());
aa.setTo_be_enriched(new ArrayList<>()); aa.setTo_be_enriched(new ArrayList<>());
aa.getTo_be_enriched().add(t2._1()); aa.getTo_be_enriched().add(t2._1());
}else { } else {
aa.getTo_be_enriched().add(t2._1()); aa.getTo_be_enriched().add(t2._1());
} }
} }
if(max_score > 0){ if (max_score > 0) {
baseAssoc.put(base_name_md5, new Tuple2(base_name, new Tuple2<>(enrich_name, max_score))); baseAssoc.put(base_name_md5, new Tuple2(base_name, new Tuple2<>(enrich_name, max_score)));
} }
@@ -158,105 +153,105 @@ public class DoiBoostAuthorMerger {
}) })
.collect(Collectors.toList()); .collect(Collectors.toList());
list.sort(Comparator.comparing(e -> e._1())); list.sort(Comparator.comparing(e -> e._1()));
//ordino per max score la baseAssoc // ordino per max score la baseAssoc
for (int i = list.size() -1 ; i>=0 ; i-- ){ for (int i = list.size() - 1; i >= 0; i--) {
Tuple2<Double, Tuple2<String, List<String>>> tmp = list.get(i); Tuple2<Double, Tuple2<String, List<String>>> tmp = list.get(i);
List<String> entries = tmp._2()._2(); List<String> entries = tmp._2()._2();
//se len = 1 => ho un solo e che con questo a ha max score // se len = 1 => ho un solo e che con questo a ha max score
if(entries.size() == 1){ if (entries.size() == 1) {
if(assocMap.containsKey(entries.get(0))) { if (assocMap.containsKey(entries.get(0))) {
enrichAuthor(assocMap.get(entries.get(0))); enrichAuthor(assocMap.get(entries.get(0)));
assocMap.remove(entries.get(0)); assocMap.remove(entries.get(0));
} }
}else{ } else {
String author_fullname = tmp._2()._1(); String author_fullname = tmp._2()._1();
long commonWords = 0; long commonWords = 0;
String enriching = null; String enriching = null;
for(String entry : entries){ for (String entry : entries) {
if (assocMap.containsKey(entry)){ if (assocMap.containsKey(entry)) {
long words = getCommonWords(normalize(entry), long words = getCommonWords(
normalize(entry),
normalize(author_fullname)); normalize(author_fullname));
if (words > commonWords){ if (words > commonWords) {
commonWords = words; commonWords = words;
enriching = entry; enriching = entry;
} }
if(words == commonWords){ if (words == commonWords) {
enriching = null; enriching = null;
} }
} }
} }
if(enriching != null){ if (enriching != null) {
enrichAuthor(assocMap.get(entries.get(0))); enrichAuthor(assocMap.get(entries.get(0)));
assocMap.remove(entries.get(0)); assocMap.remove(entries.get(0));
} }
//TODO pensare ad un modo per arricchire con il miglior e questo autore // TODO pensare ad un modo per arricchire con il miglior e questo autore
//Siamo nel caso in cui un autore ha piu' di un e con lo stesso similarity score // Siamo nel caso in cui un autore ha piu' di un e con lo stesso similarity score
} }
} }
// assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k))); // assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k)));
} }
private static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching){ private static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching) {
return fullEnrich.stream().filter( w -> fullEnriching.contains(w)).count(); return fullEnrich.stream().filter(w -> fullEnriching.contains(w)).count();
} }
private static void enrichAuthor(Author enrich, Author enriching) {
// verify if some of the words in the fullname are contained in the other
// get normalized fullname
private static void enrichAuthor(Author enrich, Author enriching){ long commonWords = getCommonWords(
//verify if some of the words in the fullname are contained in the other normalize(enrich.getFullname()),
//get normalized fullname
long commonWords = getCommonWords(normalize(enrich.getFullname()),
normalize(enriching.getFullname())); normalize(enriching.getFullname()));
if(commonWords > 0 ){ if (commonWords > 0) {
if(enrich.getPid() == null){ if (enrich.getPid() == null) {
enrich.setPid(new ArrayList<>()); enrich.setPid(new ArrayList<>());
} }
Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet()); Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet());
enriching.getPid().forEach(p -> { enriching.getPid().forEach(p -> {
if (!aPids.contains(pidToComparableString(p))){ if (!aPids.contains(pidToComparableString(p))) {
enrich.getPid().add(p); enrich.getPid().add(p);
} }
}); });
if (enrich.getAffiliation() == null){ if (enrich.getAffiliation() == null) {
if (enriching.getAffiliation() != null){ if (enriching.getAffiliation() != null) {
enrich.setAffiliation(enriching.getAffiliation()); enrich.setAffiliation(enriching.getAffiliation());
} }
} }
} }
} }
//Verify the number of words in common. The one that has more, wins. If the number of words in common are the same we // Verify the number of words in common. The one that has more, wins. If the number of words in common are the same
//enrich no author // we
// enrich no author
private static void enrichAuthor(AuthorAssoc authorAssoc) { private static void enrichAuthor(AuthorAssoc authorAssoc) {
if (authorAssoc.getTo_be_enriched().size() == 1){ if (authorAssoc.getTo_be_enriched().size() == 1) {
enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content()); enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content());
}else{ } else {
long common = 0; long common = 0;
List<Author> selected = new ArrayList<>() ; List<Author> selected = new ArrayList<>();
for(Author a : authorAssoc.getTo_be_enriched()){ for (Author a : authorAssoc.getTo_be_enriched()) {
long current_common = getCommonWords(normalize(a.getFullname()), long current_common = getCommonWords(
normalize(a.getFullname()),
normalize(authorAssoc.getWith_enricheing_content().getFullname())); normalize(authorAssoc.getWith_enricheing_content().getFullname()));
if (current_common > common){ if (current_common > common) {
common = current_common; common = current_common;
selected = new ArrayList<>(); selected = new ArrayList<>();
selected.add(a); selected.add(a);
}else if(current_common == common){ } else if (current_common == common) {
selected.add(a); selected.add(a);
} }
} }
if (selected.size() == 1){ if (selected.size() == 1) {
enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content()); enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content());
} }
} }
} }
public static String pidToComparableString(StructuredProperty pid) { public static String pidToComparableString(StructuredProperty pid) {
return (pid.getQualifier() != null return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
@@ -264,9 +259,6 @@ public class DoiBoostAuthorMerger {
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
} }
private static Double sim(Author a, Author b) { private static Double sim(Author a, Author b) {
return new Jaccard() return new Jaccard()
.score(normalizeString(a.getFullname()), normalizeString(b.getFullname())); .score(normalizeString(a.getFullname()), normalizeString(b.getFullname()));
@@ -277,7 +269,6 @@ public class DoiBoostAuthorMerger {
return String.join(" ", normalize(fullname)); return String.join(" ", normalize(fullname));
} }
private static List<String> normalize(final String s) { private static List<String> normalize(final String s) {
String[] normalized = nfd(s) String[] normalized = nfd(s)
.replaceAll("[^\\p{ASCII}]", "") .replaceAll("[^\\p{ASCII}]", "")
@@ -297,7 +288,6 @@ public class DoiBoostAuthorMerger {
return Arrays.asList(normalized); return Arrays.asList(normalized);
} }
private static String nfd(final String s) { private static String nfd(final String s) {