[DoiBoost Author merger]
This commit is contained in:
parent 41ea1b2177
commit 910abcba04
AuthorAssoc.java
@@ -1,52 +1,53 @@

package eu.dnetlib.doiboost;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import eu.dnetlib.dhp.schema.oaf.Author;

/**
 * This class stores the association information between the enriching author and the possibly enriched ones.
 * It also contains the value of the similarity score between the enriching author and the possibly enriched ones.
 * Possibly enriched authors that share the same similarity score with the enriching author are put in the
 * to_be_enriched list.
 */
public class AuthorAssoc implements Serializable {
	private Double score;
	private List<Author> to_be_enriched;
	private Author with_enricheing_content;

	public Double getScore() {
		return score;
	}

	public void setScore(Double score) {
		this.score = score;
	}

	public List<Author> getTo_be_enriched() {
		return to_be_enriched;
	}

	public void setTo_be_enriched(List<Author> to_be_enriched) {
		this.to_be_enriched = to_be_enriched;
	}

	public Author getWith_enricheing_content() {
		return with_enricheing_content;
	}

	public void setWith_enricheing_content(Author with_enricheing_content) {
		this.with_enricheing_content = with_enricheing_content;
	}

	public static AuthorAssoc newInstance(Author a) {
		AuthorAssoc ret = new AuthorAssoc();
		ret.score = 0.0;
		ret.to_be_enriched = new ArrayList<>();
		ret.with_enricheing_content = a;

		return ret;
	}
}
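A minimal, hypothetical sketch (not part of the commit) of how an AuthorAssoc is meant to be populated while scanning enrichment candidates. The demo class, the sample names and the similarity value are invented; the update rule mirrors the one applied by DoiBoostAuthorMerger below.

import java.util.ArrayList;

import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.doiboost.AuthorAssoc;

public class AuthorAssocDemo {
	public static void main(String[] args) {
		Author enriching = new Author(); // the author carrying identifiers (e.g. an ORCID pid)
		enriching.setFullname("Miriam Baglioni");

		// newInstance starts from score = 0.0 and an empty to_be_enriched list
		AuthorAssoc assoc = AuthorAssoc.newInstance(enriching);

		Author candidate = new Author(); // a possibly enriched author from the base list
		candidate.setFullname("M. Baglioni");

		double similarity = 0.9; // assumed value; the merger computes it with a Jaccard score
		if (assoc.getScore() < similarity) {
			// a strictly better candidate resets the list of authors to be enriched
			assoc.setScore(similarity);
			assoc.setTo_be_enriched(new ArrayList<>());
		}
		assoc.getTo_be_enriched().add(candidate);

		System.out.println(assoc.getScore() + " -> " + assoc.getTo_be_enriched().size()); // 0.9 -> 1
	}
}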
DoiBoostAuthorMerger.java
@@ -6,14 +6,12 @@ import java.util.*;

import java.util.stream.Collectors;

import com.wcohen.ss.Jaccard;
import com.wcohen.ss.JaroWinkler;

import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;

/**
@@ -49,258 +47,250 @@

public class DoiBoostAuthorMerger {

	public static List<Author> merge(List<List<Author>> authors, Boolean crossref) {
		Iterator<List<Author>> it = authors.iterator();
		List<Author> author = it.next();

		while (it.hasNext()) {
			List<Author> autList = it.next();
			Tuple2<List<Author>, Boolean> tmp = mergeAuthor(author, autList, crossref);
			author = tmp._1();
			crossref = tmp._2();
		}

		return author;
	}

	// If we have a list of authors coming from crossref we take that and we enrich it.
	// If we do not have a list of authors coming from crossref we enrich the longest at each step.
	public static Tuple2<List<Author>, Boolean> mergeAuthor(final List<Author> baseAuthor,
		final List<Author> otherAuthor,
		final Boolean crossref) {

		if (baseAuthor == null || baseAuthor.size() == 0)
			return new Tuple2<>(otherAuthor, false);
		if (otherAuthor == null || otherAuthor.size() == 0)
			return new Tuple2<>(baseAuthor, crossref);

		if (crossref) {
			enrichPidFromList(baseAuthor, otherAuthor);
			return new Tuple2<>(baseAuthor, true);
		} else if (baseAuthor.size() > otherAuthor.size()) {
			enrichPidFromList(baseAuthor, otherAuthor);
			return new Tuple2<>(baseAuthor, false);
		} else {
			enrichPidFromList(otherAuthor, baseAuthor);
			return new Tuple2<>(otherAuthor, false);
		}

	}

	// consider whether this should be inverted: for each enriching author we should take the
	// most similar one in the base list, not the other way around
	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {

		// search authors having identifiers in the enrich list
		final List<Author> authorsWithPids = enrich
			.stream()
			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
			.collect(Collectors.toList());

		Map<String, AuthorAssoc> assocMap = authorsWithPids
			.stream()
			.map(
				a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a)))
			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));

		Map<String, Tuple2<String, Tuple2<List<String>, Double>>> baseAssoc = new HashMap<>();

		// for each author in the base list, we search the best enriching match
		// we create the association (author, list of (enriching author, similarity score))
		base
			.stream()
			.map(
				a -> new Tuple2<>(a,
					authorsWithPids
						.stream()
						.map(e -> new Tuple2<>(e, sim(a, e)))
						.filter(t2 -> t2._2() > 0.0)
						.collect(Collectors.toList())))
			.forEach(t2 -> {
				String base_name = t2._1().getFullname();
				String base_name_md5 = DHPUtils.md5(t2._1().getFullname());
				Double max_score = 0.0;
				List<String> enrich_name = new ArrayList<>();
				for (Tuple2<Author, Double> t : t2._2()) {
					// the key of the enriching author: the md5 of its fullname
					String mapEntry = DHPUtils.md5(t._1().getFullname());

					if (t._2() > max_score) {
						max_score = t._2();
						enrich_name = new ArrayList<>();
						enrich_name.add(mapEntry);
					} else if (t._2() > 0 && t._2().equals(max_score)) {
						enrich_name.add(mapEntry);
					}

					AuthorAssoc aa = assocMap.get(mapEntry);
					if (aa.getScore() < t._2()) {
						aa.setScore(t._2());
						aa.setTo_be_enriched(new ArrayList<>());
						aa.getTo_be_enriched().add(t2._1());
					} else {
						aa.getTo_be_enriched().add(t2._1());
					}
				}
				if (max_score > 0) {
					baseAssoc.put(base_name_md5, new Tuple2<>(base_name, new Tuple2<>(enrich_name, max_score)));
				}

			});

		List<Tuple2<Double, Tuple2<String, List<String>>>> list = baseAssoc.keySet().stream().map(k -> {
			Tuple2<String, Tuple2<List<String>, Double>> map_entry = baseAssoc.get(k);
			return new Tuple2<>(map_entry._2()._2(), new Tuple2<>(map_entry._1(), map_entry._2()._1()));
		})
			.collect(Collectors.toList());
		// sort the baseAssoc entries by max score and visit them from the highest score down
		list.sort(Comparator.comparing(e -> e._1()));
		for (int i = list.size() - 1; i >= 0; i--) {
			Tuple2<Double, Tuple2<String, List<String>>> tmp = list.get(i);
			List<String> entries = tmp._2()._2();
			// a single entry means exactly one enriching author reaches the max score with this author
			if (entries.size() == 1) {
				if (assocMap.containsKey(entries.get(0))) {
					enrichAuthor(assocMap.get(entries.get(0)));
					assocMap.remove(entries.get(0));
				}
			} else {
				String author_fullname = tmp._2()._1();
				long commonWords = 0;
				String enriching = null;
				for (String entry : entries) {
					if (assocMap.containsKey(entry)) {
						// entry is an md5 key: compare on the enriching author's fullname
						long words = getCommonWords(
							normalize(assocMap.get(entry).getWith_enricheing_content().getFullname()),
							normalize(author_fullname));
						if (words > commonWords) {
							commonWords = words;
							enriching = entry;
						} else if (words == commonWords) {
							// a tie: there is no unambiguous winner
							enriching = null;
						}
					}

				}
				if (enriching != null) {
					enrichAuthor(assocMap.get(enriching));
					assocMap.remove(enriching);
				}
				// TODO: find a way to enrich this author with the best enriching one.
				// This is the case in which an author has more than one enriching author
				// with the same similarity score.
			}
		}
		// assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k)));

	}

	private static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching) {
		return fullEnrich.stream().filter(w -> fullEnriching.contains(w)).count();
	}

	private static void enrichAuthor(Author enrich, Author enriching) {
		// verify that some of the words in one fullname are contained in the other:
		// compare the normalized fullnames
		long commonWords = getCommonWords(
			normalize(enrich.getFullname()),
			normalize(enriching.getFullname()));
		if (commonWords > 0) {
			if (enrich.getPid() == null) {
				enrich.setPid(new ArrayList<>());
			}
			Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet());
			enriching.getPid().forEach(p -> {
				if (!aPids.contains(pidToComparableString(p))) {
					enrich.getPid().add(p);
				}
			});
			if (enrich.getAffiliation() == null) {
				if (enriching.getAffiliation() != null) {
					enrich.setAffiliation(enriching.getAffiliation());
				}
			}
		}

	}

	// Verify the number of words in common. The one that has more wins.
	// If the number of words in common is the same, we enrich no author.
	private static void enrichAuthor(AuthorAssoc authorAssoc) {
		if (authorAssoc.getTo_be_enriched().size() == 1) {
			enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content());
		} else {
			long common = 0;
			List<Author> selected = new ArrayList<>();
			for (Author a : authorAssoc.getTo_be_enriched()) {
				long current_common = getCommonWords(
					normalize(a.getFullname()),
					normalize(authorAssoc.getWith_enricheing_content().getFullname()));
				if (current_common > common) {
					common = current_common;
					selected = new ArrayList<>();
					selected.add(a);
				} else if (current_common == common) {
					selected.add(a);
				}
			}
			if (selected.size() == 1) {
				enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content());
			}
		}

	}

	public static String pidToComparableString(StructuredProperty pid) {
		return (pid.getQualifier() != null
			? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
			: "")
			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
	}

	private static Double sim(Author a, Author b) {
		return new Jaccard()
			.score(normalizeString(a.getFullname()), normalizeString(b.getFullname()));
	}

	private static String normalizeString(String fullname) {
		return String.join(" ", normalize(fullname));
	}

	private static List<String> normalize(final String s) {
		String[] normalized = nfd(s)
			.replaceAll("[^\\p{ASCII}]", "")
			.toLowerCase()
			// do not compact the regexes in a single expression: it would cause a
			// StackOverflowError in case of large input strings
			.replaceAll("(\\W)+", " ")
			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
			.replaceAll("(\\p{Punct})+", " ")
			.replaceAll("(\\d)+", " ")
			.replaceAll("(\\n)+", " ")
			.trim()
			.split(" ");

		Arrays.sort(normalized);

		return Arrays.asList(normalized);
	}

	private static String nfd(final String s) {
		return Normalizer.normalize(s, Normalizer.Form.NFD);
	}
}
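A hedged sketch of how the merger could be driven, assuming DoiBoostAuthorMerger sits in the eu.dnetlib.doiboost package next to AuthorAssoc; the demo class and the sample authors are invented.

import java.util.Arrays;
import java.util.List;

import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.doiboost.DoiBoostAuthorMerger;

public class MergeDemo {
	public static void main(String[] args) {
		Author crossrefAuthor = new Author(); // the author as it appears in the crossref record
		crossrefAuthor.setFullname("Miriam Baglioni");

		Author orcidAuthor = new Author(); // in a real run this one would carry an ORCID pid
		orcidAuthor.setFullname("M. Baglioni");

		List<List<Author>> authorLists = Arrays.asList(
			Arrays.asList(crossrefAuthor), // base list: crossref = true keeps it as the base
			Arrays.asList(orcidAuthor)); // further lists only contribute pids and affiliations

		List<Author> merged = DoiBoostAuthorMerger.merge(authorLists, true);
		System.out.println(merged.size()); // 1: the crossref list, enriched in place
	}
}

With crossref set to true the crossref list always survives as the base; with false the longest list wins at every step, which is what the comment above mergeAuthor describes.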
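The matching itself rests on sim(), a Jaccard score computed over normalized, sorted tokens. The standalone re-implementation of the private normalize pipeline below, illustrative only, shows why comma, case and accent variants of the same name compare as equal token sets.

import java.text.Normalizer;
import java.util.Arrays;
import java.util.List;

public class NormalizeDemo {
	// mirrors DoiBoostAuthorMerger.normalize, reproduced here for illustration
	static List<String> normalize(String s) {
		String[] tokens = Normalizer.normalize(s, Normalizer.Form.NFD)
			.replaceAll("[^\\p{ASCII}]", "") // accents were split off by NFD and are dropped here
			.toLowerCase()
			.replaceAll("(\\W)+", " ")
			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
			.replaceAll("(\\p{Punct})+", " ")
			.replaceAll("(\\d)+", " ")
			.replaceAll("(\\n)+", " ")
			.trim()
			.split(" ");
		Arrays.sort(tokens); // word order no longer matters
		return Arrays.asList(tokens);
	}

	public static void main(String[] args) {
		System.out.println(normalize("Baglioni, Miriam")); // [baglioni, miriam]
		System.out.println(normalize("Miriam Baglioni")); // [baglioni, miriam]
		System.out.println(normalize("José Ñuñez")); // [jose, nunez]
		// identical token sets mean sim() returns a Jaccard score of 1.0
	}
}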