From 97e0c27db954d3adde7f0d4c1b271c28d99cf996 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 8 Jul 2021 15:27:05 +0200 Subject: [PATCH] Added check for empty author list. If crossref is empty, the longest from all the merging providers is taken. If crossref is not empty, crossref is chosen as base for the enrichment --- .../doiboost/DoiBoostAuthorMerger.java | 226 ++++++++++-------- 1 file changed, 129 insertions(+), 97 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java index 741df13ff..537dc11a3 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java @@ -5,101 +5,159 @@ import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.utils.DHPUtils; import com.wcohen.ss.JaroWinkler; -import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.pace.model.Person; + import scala.Tuple2; public class DoiBoostAuthorMerger { - private static final Double THRESHOLD = 0.95; - public static List merge(List> authors) { + public static List merge(List> authors, Boolean crossref) { Iterator> it = authors.iterator(); - final List author = it.next(); + List author = it.next(); - it.forEachRemaining(autList -> enrichPidFromList(author, autList, THRESHOLD)); + while (it.hasNext()){ + List autList = it.next(); + Tuple2, Boolean> tmp = mergeAuthor(author, autList, crossref); + author = tmp._1(); + crossref = tmp._2(); + } return author; } - public static List mergeAuthor(final List crossrefAuthor, final List otherAuthor, - Double threshold) { + public static Tuple2, Boolean> mergeAuthor(final List baseAuthor, final List otherAuthor, + final Boolean crossref) { + + if(baseAuthor == null || baseAuthor.size() == 0) + return new Tuple2<>(otherAuthor, false); + if(otherAuthor == null || otherAuthor.size() == 0) + return new Tuple2<>(baseAuthor, crossref); + + if(crossref) { + enrichPidFromList(baseAuthor, otherAuthor); + return new Tuple2<>(baseAuthor, true); + } + else + if (baseAuthor.size() > otherAuthor.size()){ + enrichPidFromList(baseAuthor, otherAuthor); + return new Tuple2<>(baseAuthor, false); + }else{ + enrichPidFromList(otherAuthor, baseAuthor); + return new Tuple2<>(otherAuthor, false); + } - enrichPidFromList(crossrefAuthor, otherAuthor, threshold); - return crossrefAuthor; } - public static List mergeAuthor(final List crossrefAuthor, final List otherAuthor) { - return mergeAuthor(crossrefAuthor, otherAuthor, THRESHOLD); - } - private static void enrichPidFromList(List base, List enrich, Double threshold) { - if (base == null || enrich == null) - return; + private static void enrichPidFromList(List base, List enrich) { + if(base == null || enrich == null) + return ; - // (if an Author has more than 1 pid, it appears 2 times in the list) - final Map basePidAuthorMap = base - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .map(p -> new Tuple2<>(pidToComparableString(p), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + //search authors having identifiers in the enrich list + final List authorsWithPids = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .collect(Collectors.toList()); - // (list of pid that are missing in the other list) - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + Map assocMap = authorsWithPids + .stream() + .map( + a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - pidToEnrich - .forEach( - a -> { - Optional> simAuthor = base - .stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuthor.isPresent()) { - double th = threshold; - // increase the threshold if the surname is too short - if (simAuthor.get()._2().getSurname() != null - && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0) - th = 0.99; + //for each author in the base list, we search the best enriched match + base.stream() + .map(a -> new Tuple2<>(a, authorsWithPids.stream() + .map(e -> new Tuple2<>(e, sim(a, e))).collect(Collectors.toList()))) + .forEach(t2 -> { - if (simAuthor.get()._1() > th) { - Author r = simAuthor.get()._2(); - if (r.getPid() == null) { - r.setPid(new ArrayList<>()); - } - - // TERRIBLE HACK but for some reason when we create and Array with Arrays.asList, - // it creates of fixed size, and the add method raise UnsupportedOperationException at - // java.util.AbstractList.add - final List tmp = new ArrayList<>(r.getPid()); - tmp.add(a._1()); - r.setPid(tmp); + for (Tuple2 t : t2._2()) { + String mapEntry = DHPUtils.md5(t._1().getFullname()); + AuthorAssoc aa = assocMap.get(mapEntry); + if(aa.getScore() < t._2()){ + aa.setScore(t._2()); + aa.setTo_be_enriched(new ArrayList<>()); + aa.getTo_be_enriched().add(t2._1()); + }else if(aa.getScore() == t._2()){ + aa.getTo_be_enriched().add(t2._1()); } + } + + }); + + assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k))); + + + } + + private static long getCommonWords(List fullEnrich, List fullEnriching){ + return fullEnrich.stream().filter( w -> fullEnriching.contains(w)).count(); + } + + + private static void enrichAuthor(Author enrich, Author enriching){ + //verify if some of the words in the fullname are contained in the other + //get normalized fullname + + long commonWords = getCommonWords(normalize(enrich.getFullname()), + normalize(enriching.getFullname())); + if(commonWords > 0 ){ + if(enrich.getPid() == null){ + enrich.setPid(new ArrayList<>()); + } + Set aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet()); + enriching.getPid().forEach(p -> { + if (!aPids.contains(pidToComparableString(p))){ + enrich.getPid().add(p); } }); + if (enrich.getAffiliation() == null){ + if (enriching.getAffiliation() != null){ + enrich.setAffiliation(enriching.getAffiliation()); + } + } + } + + } + //Verify the number of words in common. The one that has more, wins. If the number of words in common are the same we + //enrich no author + private static void enrichAuthor(AuthorAssoc authorAssoc) { + if (authorAssoc.getTo_be_enriched().size() == 1){ + enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content()); + }else{ + long common = 0; + List selected = new ArrayList<>() ; + for(Author a : authorAssoc.getTo_be_enriched()){ + long current_common = getCommonWords(normalize(a.getFullname()), + normalize(authorAssoc.getWith_enricheing_content().getFullname())); + if (current_common > common){ + common = current_common; + selected = new ArrayList<>(); + selected.add(a); + }else if(current_common == common){ + selected.add(a); + } + } + if (selected.size() == 1){ + enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content()); + } + } + + } + + public static String pidToComparableString(StructuredProperty pid) { return (pid.getQualifier() != null ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" @@ -107,49 +165,21 @@ public class DoiBoostAuthorMerger { + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); } - public static int countAuthorsPids(List authors) { - if (authors == null) - return 0; - return (int) authors.stream().filter(DoiBoostAuthorMerger::hasPid).count(); - } - private static int authorsSize(List authors) { - if (authors == null) - return 0; - return authors.size(); - } private static Double sim(Author a, Author b) { - - final Person pa = parse(a); - final Person pb = parse(b); - - // if both are accurate (e.g. they have name and surname) - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5 - + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5; - } else { return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } + .score(normalizeString(a.getFullname()), normalizeString(b.getFullname())); + } - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) - return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + private static String normalizeString(String fullname) { + return String.join(" ", normalize(fullname)); } - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } - private static String normalize(final String s) { + private static List normalize(final String s) { String[] normalized = nfd(s) .replaceAll("[^\\p{ASCII}]", "") .toLowerCase() @@ -166,7 +196,9 @@ public class DoiBoostAuthorMerger { Arrays.sort(normalized); - return String.join(" ", normalized); + return Arrays.asList(normalized); + + } private static String nfd(final String s) {