From d33f578e544c23eb8e1a1dd4edfd79b0a42d854a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 1 Dec 2023 15:14:17 +0100 Subject: [PATCH] code formatting --- .../eu/dnetlib/dhp/oa/merge/AuthorMerger.java | 511 +++++++++--------- 1 file changed, 256 insertions(+), 255 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java index 0153e52d5..0461c9353 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java @@ -4,318 +4,319 @@ package eu.dnetlib.dhp.oa.merge; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; + import com.wcohen.ss.JaroWinkler; + import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.pace.model.Person; import scala.Tuple2; - public class AuthorMerger { - private static final Double THRESHOLD = 0.95; + private static final Double THRESHOLD = 0.95; - private AuthorMerger() { - } + private AuthorMerger() { + } - public static List merge(List> authors) { + public static List merge(List> authors) { - authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2))); + authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2))); - List author = new ArrayList<>(); + List author = new ArrayList<>(); - for (List a : authors) { - author = mergeAuthor(author, a); - } + for (List a : authors) { + author = mergeAuthor(author, a); + } - return author; + return author; - } + } - public static List mergeAuthor(final List a, final List b, Double threshold) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base; - List enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); + public static List mergeAuthor(final List a, final List b, Double threshold) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base; + List enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); - if (sa == sb) { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } else { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } - enrichPidFromList(base, enrich, threshold); - return base; - } + if (sa == sb) { + base = pa > pb ? a : b; + enrich = pa > pb ? b : a; + } else { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } + enrichPidFromList(base, enrich, threshold); + return base; + } - public static List mergeAuthor(final List a, final List b) { - return mergeAuthor(a, b, THRESHOLD); - } + public static List mergeAuthor(final List a, final List b) { + return mergeAuthor(a, b, THRESHOLD); + } - private static void enrichPidFromList(List base, List enrich, Double threshold) { - if (base == null || enrich == null) - return; + private static void enrichPidFromList(List base, List enrich, Double threshold) { + if (base == null || enrich == null) + return; - // (if an Author has more than 1 pid, it appears 2 times in the list) - final Map basePidAuthorMap = base - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .flatMap( - a -> a - .getPid() - .stream() - .filter(Objects::nonNull) - .map(p -> new Tuple2<>(pidToComparableString(p), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + // (if an Author has more than 1 pid, it appears 2 times in the list) + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .flatMap( + a -> a + .getPid() + .stream() + .filter(Objects::nonNull) + .map(p -> new Tuple2<>(pidToComparableString(p), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - // (list of pid that are missing in the other list) - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .flatMap( - a -> a - .getPid() - .stream() - .filter(Objects::nonNull) - .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + // (list of pid that are missing in the other list) + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .flatMap( + a -> a + .getPid() + .stream() + .filter(Objects::nonNull) + .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); - pidToEnrich - .forEach( - a -> { - Optional> simAuthor = base - .stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); + pidToEnrich + .forEach( + a -> { + Optional> simAuthor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); - if (simAuthor.isPresent()) { - double th = threshold; - // increase the threshold if the surname is too short - if (simAuthor.get()._2().getSurname() != null - && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0) - th = 0.99; + if (simAuthor.isPresent()) { + double th = threshold; + // increase the threshold if the surname is too short + if (simAuthor.get()._2().getSurname() != null + && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0) + th = 0.99; - if (simAuthor.get()._1() > th) { - Author r = simAuthor.get()._2(); - if (r.getPid() == null) { - r.setPid(new ArrayList<>()); - } + if (simAuthor.get()._1() > th) { + Author r = simAuthor.get()._2(); + if (r.getPid() == null) { + r.setPid(new ArrayList<>()); + } - // TERRIBLE HACK but for some reason when we create and Array with Arrays.asList, - // it creates of fixed size, and the add method raise UnsupportedOperationException at - // java.util.AbstractList.add - final List tmp = new ArrayList<>(r.getPid()); - tmp.add(a._1()); - r.setPid(tmp); - } - } - }); - } + // TERRIBLE HACK but for some reason when we create and Array with Arrays.asList, + // it creates of fixed size, and the add method raise UnsupportedOperationException at + // java.util.AbstractList.add + final List tmp = new ArrayList<>(r.getPid()); + tmp.add(a._1()); + r.setPid(tmp); + } + } + }); + } - public static String normalizeFullName(final String fullname) { - return nfd(fullname) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") + public static String normalizeFullName(final String fullname) { + return nfd(fullname) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") - .trim(); - } + .trim(); + } + private static String authorFieldToBeCompared(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return author.getSurname(); - private static String authorFieldToBeCompared(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return author.getSurname(); + } + if (StringUtils.isNotBlank(author.getFullname())) { + return author.getFullname(); + } + return null; + } - } - if (StringUtils.isNotBlank(author.getFullname())) { - return author.getFullname(); - } - return null; - } + /** + * This method tries to figure out when two author are the same in the contest + * of ORCID enrichment + * + * @param left Author in the OAF entity + * @param right Author ORCID + * @return based on a heuristic on the names of the authors if they are the same. + */ + public static boolean checkORCIDSimilarity(final Author left, final Author right) { + final Person pl = parse(left); + final Person pr = parse(right); - /** - * This method tries to figure out when two author are the same in the contest - * of ORCID enrichment - * - * @param left Author in the OAF entity - * @param right Author ORCID - * @return based on a heuristic on the names of the authors if they are the same. - */ - public static boolean checkORCIDSimilarity(final Author left, final Author right) { - final Person pl = parse(left); - final Person pr = parse(right); + // If one of them didn't have a surname we verify if they have the fullName not empty + // and verify if the normalized version is equal + if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) && + pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) { - // If one of them didn't have a surname we verify if they have the fullName not empty - // and verify if the normalized version is equal - if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) && - pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) { + if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null + && !pr.getFullname().isEmpty()) { + return pl + .getFullname() + .stream() + .anyMatch( + fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr)))); + } else { + return false; + } + } + // The Authors have one surname in common + if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) { - if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null - && !pr.getFullname().isEmpty()) { - return pl - .getFullname() - .stream() - .anyMatch( - fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr)))); - } else { - return false; - } - } - // The Authors have one surname in common - if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) { + // If one of them has only a surname and is the same we can say that they are the same author + if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) || + (pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank))) + return true; + // The authors have the same initials of Name in common + if (pl + .getName() + .stream() + .anyMatch( + nl -> pr + .getName() + .stream() + .anyMatch(nr -> nr.equalsIgnoreCase(nl)))) + return true; + } - // If one of them has only a surname and is the same we can say that they are the same author - if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) || - (pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank))) - return true; - // The authors have the same initials of Name in common - if (pl - .getName() - .stream() - .anyMatch( - nl -> pr - .getName() - .stream() - .anyMatch(nr -> nr.equalsIgnoreCase(nl)))) - return true; - } + // Sometimes we noticed that publication have author wrote in inverse order Surname, Name + // We verify if we have an exact match between name and surname + if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) && + pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl)))) + return true; + else + return false; + } + // - // Sometimes we noticed that publication have author wrote in inverse order Surname, Name - // We verify if we have an exact match between name and surname - if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) && - pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl)))) - return true; - else - return false; - } - // + /** + * Method to enrich ORCID information in one list of authors based on another list + * + * @param baseAuthor the Author List in the OAF Entity + * @param orcidAuthor The list of ORCID Author intersected + * @return The Author List of the OAF Entity enriched with the orcid Author + */ + public static List enrichOrcid(List baseAuthor, List orcidAuthor) { - /** - * Method to enrich ORCID information in one list of authors based on another list - * - * @param baseAuthor the Author List in the OAF Entity - * @param orcidAuthor The list of ORCID Author intersected - * @return The Author List of the OAF Entity enriched with the orcid Author - */ - public static List enrichOrcid(List baseAuthor, List orcidAuthor) { + if (baseAuthor == null || baseAuthor.isEmpty()) + return orcidAuthor; - if (baseAuthor == null || baseAuthor.isEmpty()) - return orcidAuthor; + if (orcidAuthor == null || orcidAuthor.isEmpty()) + return baseAuthor; - if (orcidAuthor == null || orcidAuthor.isEmpty()) - return baseAuthor; + if (baseAuthor.size() == 1 && orcidAuthor.size() > 10) + return baseAuthor; - if (baseAuthor.size() == 1 && orcidAuthor.size() > 10) - return baseAuthor; + final List oAuthor = new ArrayList<>(); + oAuthor.addAll(orcidAuthor); - final List oAuthor = new ArrayList<>(); - oAuthor.addAll(orcidAuthor); + baseAuthor.forEach(ba -> { + Optional aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst(); + if (aMatch.isPresent()) { + final Author sameAuthor = aMatch.get(); + addPid(ba, sameAuthor.getPid()); + oAuthor.remove(sameAuthor); + } + }); + return baseAuthor; + } - baseAuthor.forEach(ba -> { - Optional aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst(); - if (aMatch.isPresent()) { - final Author sameAuthor = aMatch.get(); - addPid(ba, sameAuthor.getPid()); - oAuthor.remove(sameAuthor); - } - }); - return baseAuthor; - } + private static void addPid(final Author a, final List pids) { - private static void addPid(final Author a, final List pids) { + if (a.getPid() == null) { + a.setPid(new ArrayList<>()); + } - if (a.getPid() == null) { - a.setPid(new ArrayList<>()); - } + a.getPid().addAll(pids); - a.getPid().addAll(pids); + } - } + public static String pidToComparableString(StructuredProperty pid) { + final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() + : ""; + return (pid.getQualifier() != null ? classid : "") + + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + } - public static String pidToComparableString(StructuredProperty pid) { - final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() - : ""; - return (pid.getQualifier() != null ? classid : "") - + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); - } + public static int countAuthorsPids(List authors) { + if (authors == null) + return 0; - public static int countAuthorsPids(List authors) { - if (authors == null) - return 0; + return (int) authors.stream().filter(AuthorMerger::hasPid).count(); + } - return (int) authors.stream().filter(AuthorMerger::hasPid).count(); - } + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } - private static int authorsSize(List authors) { - if (authors == null) - return 0; - return authors.size(); - } + private static Double sim(Author a, Author b) { - private static Double sim(Author a, Author b) { + final Person pa = parse(a); + final Person pb = parse(b); - final Person pa = parse(a); - final Person pb = parse(b); + // if both are accurate (e.g. they have name and surname) + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5 + + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5; + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } - // if both are accurate (e.g. they have name and surname) - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5 - + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5; - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().isEmpty()) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().isEmpty()) - return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + if (StringUtils.isNotBlank(author.getFullname())) + return new Person(author.getFullname(), false); + else + return new Person("", false); + } + } - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - if (StringUtils.isNotBlank(author.getFullname())) - return new Person(author.getFullname(), false); - else - return new Person("", false); - } - } + public static String normalize(final String s) { + String[] normalized = nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim() + .split(" "); - public static String normalize(final String s) { - String[] normalized = nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim() - .split(" "); + Arrays.sort(normalized); - Arrays.sort(normalized); + return String.join(" ", normalized); + } - return String.join(" ", normalized); - } - - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } }