From 910abcba049dbe2d87b1dd8fcf81f5f345999280 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 22 Nov 2021 16:54:27 +0100 Subject: [PATCH] [DoiBoost Author merger] - --- .../java/eu/dnetlib/doiboost/AuthorAssoc.java | 61 +-- .../doiboost/DoiBoostAuthorMerger.java | 440 +++++++++--------- 2 files changed, 246 insertions(+), 255 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/AuthorAssoc.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/AuthorAssoc.java index 5bba7d63a..86f24256c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/AuthorAssoc.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/AuthorAssoc.java @@ -1,52 +1,53 @@ -package eu.dnetlib.doiboost; -import eu.dnetlib.dhp.schema.oaf.Author; +package eu.dnetlib.doiboost; import java.io.Serializable; import java.util.ArrayList; import java.util.List; +import eu.dnetlib.dhp.schema.oaf.Author; + /** * This class stores the association information between the enriching author and the possibly enriched ones. * It also contains the value of the similarity score between the enriching author and the possibly enriched ones. * Possibly enriched authors with the same similarity score with the enriching are put in the to_be_enriched list. */ public class AuthorAssoc implements Serializable { - private Double score ; - private List to_be_enriched; - private Author with_enricheing_content; + private Double score; + private List to_be_enriched; + private Author with_enricheing_content; - public Double getScore() { - return score; - } + public Double getScore() { + return score; + } - public void setScore(Double score) { - this.score = score; - } + public void setScore(Double score) { + this.score = score; + } - public List getTo_be_enriched() { - return to_be_enriched; - } + public List getTo_be_enriched() { + return to_be_enriched; + } - public void setTo_be_enriched(List to_be_enriched) { - this.to_be_enriched = to_be_enriched; - } + public void setTo_be_enriched(List to_be_enriched) { + this.to_be_enriched = to_be_enriched; + } - public Author getWith_enricheing_content() { - return with_enricheing_content; - } + public Author getWith_enricheing_content() { + return with_enricheing_content; + } - public void setWith_enricheing_content(Author with_enricheing_content) { - this.with_enricheing_content = with_enricheing_content; - } + public void setWith_enricheing_content(Author with_enricheing_content) { + this.with_enricheing_content = with_enricheing_content; + } - public static AuthorAssoc newInstance(Author a){ - AuthorAssoc ret = new AuthorAssoc(); - ret.score = 0.0; - ret.to_be_enriched = new ArrayList<>(); - ret.with_enricheing_content = a; + public static AuthorAssoc newInstance(Author a) { + AuthorAssoc ret = new AuthorAssoc(); + ret.score = 0.0; + ret.to_be_enriched = new ArrayList<>(); + ret.with_enricheing_content = a; - return ret; + return ret; - } + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java index a7d8e47d6..dc77422b6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java @@ -6,14 +6,12 @@ import java.util.*; import java.util.stream.Collectors; import com.wcohen.ss.Jaccard; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.utils.DHPUtils; - import com.wcohen.ss.JaroWinkler; import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; - +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; /** @@ -49,258 +47,250 @@ import scala.Tuple2; public class DoiBoostAuthorMerger { + public static List merge(List> authors, Boolean crossref) { - public static List merge(List> authors, Boolean crossref) { + Iterator> it = authors.iterator(); + List author = it.next(); - Iterator> it = authors.iterator(); - List author = it.next(); + while (it.hasNext()) { + List autList = it.next(); + Tuple2, Boolean> tmp = mergeAuthor(author, autList, crossref); + author = tmp._1(); + crossref = tmp._2(); + } - while (it.hasNext()){ - List autList = it.next(); - Tuple2, Boolean> tmp = mergeAuthor(author, autList, crossref); - author = tmp._1(); - crossref = tmp._2(); - } + return author; - return author; + } - } + // If we have a list of authors coming from crossref we take that and we enrich it + // If we do not have a list of authors coming from crossref we enrich the longest at each step + public static Tuple2, Boolean> mergeAuthor(final List baseAuthor, + final List otherAuthor, + final Boolean crossref) { - //If we have a list of authors coming from crossref we take that and we enrich it - //If we do not have a list of authors coming from crossref we enrich the longest at each step - public static Tuple2, Boolean> mergeAuthor(final List baseAuthor, final List otherAuthor, - final Boolean crossref) { + if (baseAuthor == null || baseAuthor.size() == 0) + return new Tuple2<>(otherAuthor, false); + if (otherAuthor == null || otherAuthor.size() == 0) + return new Tuple2<>(baseAuthor, crossref); - if(baseAuthor == null || baseAuthor.size() == 0) - return new Tuple2<>(otherAuthor, false); - if(otherAuthor == null || otherAuthor.size() == 0) - return new Tuple2<>(baseAuthor, crossref); + if (crossref) { + enrichPidFromList(baseAuthor, otherAuthor); + return new Tuple2<>(baseAuthor, true); + } else if (baseAuthor.size() > otherAuthor.size()) { + enrichPidFromList(baseAuthor, otherAuthor); + return new Tuple2<>(baseAuthor, false); + } else { + enrichPidFromList(otherAuthor, baseAuthor); + return new Tuple2<>(otherAuthor, false); + } - if(crossref) { - enrichPidFromList(baseAuthor, otherAuthor); - return new Tuple2<>(baseAuthor, true); - } - else - if (baseAuthor.size() > otherAuthor.size()){ - enrichPidFromList(baseAuthor, otherAuthor); - return new Tuple2<>(baseAuthor, false); - }else{ - enrichPidFromList(otherAuthor, baseAuthor); - return new Tuple2<>(otherAuthor, false); - } + } - } + // valutare se questa cosa va invertita: dovrei prendere per ogni enriching author quello che piu' gli somiglia + // nella base list non il contrario + private static void enrichPidFromList(List base, List enrich) { + // search authors having identifiers in the enrich list + final List authorsWithPids = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .collect(Collectors.toList()); - //valutare se questa cosa va invertita: dovrei prendere per ogni enriching author quello che piu' gli somiglia - //nella base list non il contrario - private static void enrichPidFromList(List base, List enrich) { + Map assocMap = authorsWithPids + .stream() + .map( + a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - //search authors having identifiers in the enrich list - final List authorsWithPids = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .collect(Collectors.toList()); + Map, Double>>> baseAssoc = new HashMap<>(); - Map assocMap = authorsWithPids - .stream() - .map( - a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + // for each author in the base list, we search the best enriching match + // we create the association (author, list of (enriching author, similatiry score)) + base + .stream() + .map( + a -> new Tuple2<>(a, + authorsWithPids + .stream() + .map(e -> new Tuple2<>(e, sim(a, e))) + .filter(t2 -> t2._2() > 0.0) + .collect(Collectors.toList()))) + .forEach(t2 -> { + String base_name = t2._1().getFullname(); + String base_name_md5 = DHPUtils.md5(t2._1().getFullname()); + Double max_score = 0.0; + List enrich_name = new ArrayList(); + for (Tuple2 t : t2._2()) { + // we get the fullname of the enriching + String mapEntry = DHPUtils.md5(t._1().getFullname()); - Map, Double>>> baseAssoc = new HashMap<>(); + if (t._2() > max_score) { + max_score = t._2(); + enrich_name = new ArrayList(); + enrich_name.add(mapEntry); + } else if (t._2() > 0 && t._2().equals(max_score)) { + enrich_name.add(mapEntry); + } + AuthorAssoc aa = assocMap.get(mapEntry); + if (aa.getScore() < t._2()) { + aa.setScore(t._2()); + aa.setTo_be_enriched(new ArrayList<>()); + aa.getTo_be_enriched().add(t2._1()); + } else { + aa.getTo_be_enriched().add(t2._1()); + } + } + if (max_score > 0) { + baseAssoc.put(base_name_md5, new Tuple2(base_name, new Tuple2<>(enrich_name, max_score))); + } - //for each author in the base list, we search the best enriching match - //we create the association (author, list of (enriching author, similatiry score)) - base.stream() - .map(a -> - new Tuple2<>(a, - authorsWithPids.stream() - .map(e -> new Tuple2<>(e, sim(a, e))) - .filter(t2 -> t2._2() > 0.0) - .collect(Collectors.toList())) - ) - .forEach(t2 -> { - String base_name = t2._1().getFullname(); - String base_name_md5 = DHPUtils.md5(t2._1().getFullname()); - Double max_score = 0.0; - List enrich_name = new ArrayList(); - for (Tuple2 t : t2._2()) { - //we get the fullname of the enriching - String mapEntry = DHPUtils.md5(t._1().getFullname()); + }); + List>>> list = baseAssoc.keySet().stream().map(k -> { + Tuple2, Double>> map_entry = baseAssoc.get(k); + return new Tuple2<>(map_entry._2()._2(), new Tuple2<>(map_entry._1(), map_entry._2()._1())); + }) + .collect(Collectors.toList()); + list.sort(Comparator.comparing(e -> e._1())); + // ordino per max score la baseAssoc + for (int i = list.size() - 1; i >= 0; i--) { + Tuple2>> tmp = list.get(i); + List entries = tmp._2()._2(); + // se len = 1 => ho un solo e che con questo a ha max score + if (entries.size() == 1) { + if (assocMap.containsKey(entries.get(0))) { + enrichAuthor(assocMap.get(entries.get(0))); + assocMap.remove(entries.get(0)); + } + } else { + String author_fullname = tmp._2()._1(); + long commonWords = 0; + String enriching = null; + for (String entry : entries) { + if (assocMap.containsKey(entry)) { + long words = getCommonWords( + normalize(entry), + normalize(author_fullname)); + if (words > commonWords) { + commonWords = words; + enriching = entry; + } + if (words == commonWords) { + enriching = null; + } + } - if(t._2() > max_score){ - max_score = t._2(); - enrich_name = new ArrayList(); - enrich_name.add(mapEntry); - }else if(t._2() > 0 && t._2().equals(max_score)){ - enrich_name.add(mapEntry); - } + } + if (enriching != null) { + enrichAuthor(assocMap.get(entries.get(0))); + assocMap.remove(entries.get(0)); + } + // TODO pensare ad un modo per arricchire con il miglior e questo autore + // Siamo nel caso in cui un autore ha piu' di un e con lo stesso similarity score + } + } + // assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k))); - AuthorAssoc aa = assocMap.get(mapEntry); - if(aa.getScore() < t._2()){ - aa.setScore(t._2()); - aa.setTo_be_enriched(new ArrayList<>()); - aa.getTo_be_enriched().add(t2._1()); - }else { - aa.getTo_be_enriched().add(t2._1()); - } - } - if(max_score > 0){ - baseAssoc.put(base_name_md5, new Tuple2(base_name, new Tuple2<>(enrich_name, max_score))); - } + } - }); - List>>> list = baseAssoc.keySet().stream().map(k -> { - Tuple2, Double>> map_entry = baseAssoc.get(k); - return new Tuple2<>(map_entry._2()._2(), new Tuple2<>(map_entry._1(), map_entry._2()._1())); - }) - .collect(Collectors.toList()); - list.sort(Comparator.comparing(e -> e._1())); - //ordino per max score la baseAssoc - for (int i = list.size() -1 ; i>=0 ; i-- ){ - Tuple2>> tmp = list.get(i); - List entries = tmp._2()._2(); - //se len = 1 => ho un solo e che con questo a ha max score - if(entries.size() == 1){ - if(assocMap.containsKey(entries.get(0))) { - enrichAuthor(assocMap.get(entries.get(0))); - assocMap.remove(entries.get(0)); - } - }else{ - String author_fullname = tmp._2()._1(); - long commonWords = 0; - String enriching = null; - for(String entry : entries){ - if (assocMap.containsKey(entry)){ - long words = getCommonWords(normalize(entry), - normalize(author_fullname)); - if (words > commonWords){ - commonWords = words; - enriching = entry; - } - if(words == commonWords){ - enriching = null; - } - } + private static long getCommonWords(List fullEnrich, List fullEnriching) { + return fullEnrich.stream().filter(w -> fullEnriching.contains(w)).count(); + } - } - if(enriching != null){ - enrichAuthor(assocMap.get(entries.get(0))); - assocMap.remove(entries.get(0)); - } - //TODO pensare ad un modo per arricchire con il miglior e questo autore - //Siamo nel caso in cui un autore ha piu' di un e con lo stesso similarity score - } - } - // assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k))); + private static void enrichAuthor(Author enrich, Author enriching) { + // verify if some of the words in the fullname are contained in the other + // get normalized fullname + long commonWords = getCommonWords( + normalize(enrich.getFullname()), + normalize(enriching.getFullname())); + if (commonWords > 0) { + if (enrich.getPid() == null) { + enrich.setPid(new ArrayList<>()); + } + Set aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet()); + enriching.getPid().forEach(p -> { + if (!aPids.contains(pidToComparableString(p))) { + enrich.getPid().add(p); + } + }); + if (enrich.getAffiliation() == null) { + if (enriching.getAffiliation() != null) { + enrich.setAffiliation(enriching.getAffiliation()); + } + } + } - } + } - private static long getCommonWords(List fullEnrich, List fullEnriching){ - return fullEnrich.stream().filter( w -> fullEnriching.contains(w)).count(); - } + // Verify the number of words in common. The one that has more, wins. If the number of words in common are the same + // we + // enrich no author + private static void enrichAuthor(AuthorAssoc authorAssoc) { + if (authorAssoc.getTo_be_enriched().size() == 1) { + enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content()); + } else { + long common = 0; + List selected = new ArrayList<>(); + for (Author a : authorAssoc.getTo_be_enriched()) { + long current_common = getCommonWords( + normalize(a.getFullname()), + normalize(authorAssoc.getWith_enricheing_content().getFullname())); + if (current_common > common) { + common = current_common; + selected = new ArrayList<>(); + selected.add(a); + } else if (current_common == common) { + selected.add(a); + } + } + if (selected.size() == 1) { + enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content()); + } + } + } - private static void enrichAuthor(Author enrich, Author enriching){ - //verify if some of the words in the fullname are contained in the other - //get normalized fullname + public static String pidToComparableString(StructuredProperty pid) { + return (pid.getQualifier() != null + ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" + : "") + + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + } - long commonWords = getCommonWords(normalize(enrich.getFullname()), - normalize(enriching.getFullname())); - if(commonWords > 0 ){ - if(enrich.getPid() == null){ - enrich.setPid(new ArrayList<>()); - } - Set aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet()); - enriching.getPid().forEach(p -> { - if (!aPids.contains(pidToComparableString(p))){ - enrich.getPid().add(p); - } - }); - if (enrich.getAffiliation() == null){ - if (enriching.getAffiliation() != null){ - enrich.setAffiliation(enriching.getAffiliation()); - } - } - } + private static Double sim(Author a, Author b) { + return new Jaccard() + .score(normalizeString(a.getFullname()), normalizeString(b.getFullname())); + } - } + private static String normalizeString(String fullname) { + return String.join(" ", normalize(fullname)); + } - //Verify the number of words in common. The one that has more, wins. If the number of words in common are the same we - //enrich no author - private static void enrichAuthor(AuthorAssoc authorAssoc) { - if (authorAssoc.getTo_be_enriched().size() == 1){ - enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content()); - }else{ - long common = 0; - List selected = new ArrayList<>() ; - for(Author a : authorAssoc.getTo_be_enriched()){ - long current_common = getCommonWords(normalize(a.getFullname()), - normalize(authorAssoc.getWith_enricheing_content().getFullname())); - if (current_common > common){ - common = current_common; - selected = new ArrayList<>(); - selected.add(a); - }else if(current_common == common){ - selected.add(a); - } - } - if (selected.size() == 1){ - enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content()); - } - } + private static List normalize(final String s) { + String[] normalized = nfd(s) + .replaceAll("[^\\p{ASCII}]", "") + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim() + .split(" "); - } + Arrays.sort(normalized); + return Arrays.asList(normalized); - public static String pidToComparableString(StructuredProperty pid) { - return (pid.getQualifier() != null - ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" - : "") - + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); - } + } - - - - private static Double sim(Author a, Author b) { - return new Jaccard() - .score(normalizeString(a.getFullname()), normalizeString(b.getFullname())); - - } - - private static String normalizeString(String fullname) { - return String.join(" ", normalize(fullname)); - } - - - private static List normalize(final String s) { - String[] normalized = nfd(s) - .replaceAll("[^\\p{ASCII}]", "") - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim() - .split(" "); - - Arrays.sort(normalized); - - return Arrays.asList(normalized); - - - } - - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } }