code formatting

2023-12-01 15:14:17 +01:00 · 2023-12-01 15:14:17 +01:00 · d33f578e54
parent c5ac593c07
commit d33f578e54
1 changed files with 256 additions and 255 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -4,318 +4,319 @@ package eu.dnetlib.dhp.oa.merge;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import com.wcohen.ss.JaroWinkler;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.pace.model.Person;
 import scala.Tuple2;
 public class AuthorMerger {
-    private static final Double THRESHOLD = 0.95;
+	private static final Double THRESHOLD = 0.95;
-    private AuthorMerger() {
+	private AuthorMerger() {
-    }
+	}
-    public static List<Author> merge(List<List<Author>> authors) {
+	public static List<Author> merge(List<List<Author>> authors) {
-        authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
+		authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
-        List<Author> author = new ArrayList<>();
+		List<Author> author = new ArrayList<>();
-        for (List<Author> a : authors) {
+		for (List<Author> a : authors) {
-            author = mergeAuthor(author, a);
+			author = mergeAuthor(author, a);
-        }
+		}
-        return author;
+		return author;
-    }
+	}
-    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
+	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
-        int pa = countAuthorsPids(a);
+		int pa = countAuthorsPids(a);
-        int pb = countAuthorsPids(b);
+		int pb = countAuthorsPids(b);
-        List<Author> base;
+		List<Author> base;
-        List<Author> enrich;
+		List<Author> enrich;
-        int sa = authorsSize(a);
+		int sa = authorsSize(a);
-        int sb = authorsSize(b);
+		int sb = authorsSize(b);
-        if (sa == sb) {
+		if (sa == sb) {
-            base = pa > pb ? a : b;
+			base = pa > pb ? a : b;
-            enrich = pa > pb ? b : a;
+			enrich = pa > pb ? b : a;
-        } else {
+		} else {
-            base = sa > sb ? a : b;
+			base = sa > sb ? a : b;
-            enrich = sa > sb ? b : a;
+			enrich = sa > sb ? b : a;
-        }
+		}
-        enrichPidFromList(base, enrich, threshold);
+		enrichPidFromList(base, enrich, threshold);
-        return base;
+		return base;
-    }
+	}
-    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
-        return mergeAuthor(a, b, THRESHOLD);
+		return mergeAuthor(a, b, THRESHOLD);
-    }
+	}
-    private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
+	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
-        if (base == null || enrich == null)
+		if (base == null || enrich == null)
-            return;
+			return;
-        // <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
+		// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
-        final Map<String, Author> basePidAuthorMap = base
+		final Map<String, Author> basePidAuthorMap = base
-                .stream()
+			.stream()
-                .filter(a -> a.getPid() != null && !a.getPid().isEmpty())
+			.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
-                .flatMap(
+			.flatMap(
-                        a -> a
+				a -> a
-                                .getPid()
+					.getPid()
-                                .stream()
+					.stream()
-                                .filter(Objects::nonNull)
+					.filter(Objects::nonNull)
-                                .map(p -> new Tuple2<>(pidToComparableString(p), a)))
+					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
-                .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
+			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
-        // <pid, Author> (list of pid that are missing in the other list)
+		// <pid, Author> (list of pid that are missing in the other list)
-        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
-                .stream()
+			.stream()
-                .filter(a -> a.getPid() != null && !a.getPid().isEmpty())
+			.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
-                .flatMap(
+			.flatMap(
-                        a -> a
+				a -> a
-                                .getPid()
+					.getPid()
-                                .stream()
+					.stream()
-                                .filter(Objects::nonNull)
+					.filter(Objects::nonNull)
-                                .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
+					.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
-                                .map(p -> new Tuple2<>(p, a)))
+					.map(p -> new Tuple2<>(p, a)))
-                .collect(Collectors.toList());
+			.collect(Collectors.toList());
-        pidToEnrich
+		pidToEnrich
-                .forEach(
+			.forEach(
-                        a -> {
+				a -> {
-                            Optional<Tuple2<Double, Author>> simAuthor = base
+					Optional<Tuple2<Double, Author>> simAuthor = base
-                                    .stream()
+						.stream()
-                                    .map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
+						.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
-                                    .max(Comparator.comparing(Tuple2::_1));
+						.max(Comparator.comparing(Tuple2::_1));
-                            if (simAuthor.isPresent()) {
+					if (simAuthor.isPresent()) {
-                                double th = threshold;
+						double th = threshold;
-                                // increase the threshold if the surname is too short
+						// increase the threshold if the surname is too short
-                                if (simAuthor.get()._2().getSurname() != null
+						if (simAuthor.get()._2().getSurname() != null
-                                        && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
+							&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
-                                    th = 0.99;
+							th = 0.99;
-                                if (simAuthor.get()._1() > th) {
+						if (simAuthor.get()._1() > th) {
-                                    Author r = simAuthor.get()._2();
+							Author r = simAuthor.get()._2();
-                                    if (r.getPid() == null) {
+							if (r.getPid() == null) {
-                                        r.setPid(new ArrayList<>());
+								r.setPid(new ArrayList<>());
-                                    }
+							}
-                                    // TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
+							// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
-                                    // it creates of fixed size, and the add method raise UnsupportedOperationException at
+							// it creates of fixed size, and the add method raise UnsupportedOperationException at
-                                    // java.util.AbstractList.add
+							// java.util.AbstractList.add
-                                    final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
+							final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
-                                    tmp.add(a._1());
+							tmp.add(a._1());
-                                    r.setPid(tmp);
+							r.setPid(tmp);
-                                }
+						}
-                            }
+					}
-                        });
+				});
-    }
+	}
-    public static String normalizeFullName(final String fullname) {
+	public static String normalizeFullName(final String fullname) {
-        return nfd(fullname)
+		return nfd(fullname)
-                .toLowerCase()
+			.toLowerCase()
-                // do not compact the regexes in a single expression, would cause StackOverflowError
+			// do not compact the regexes in a single expression, would cause StackOverflowError
-                // in case
+			// in case
-                // of large input strings
+			// of large input strings
-                .replaceAll("(\\W)+", " ")
+			.replaceAll("(\\W)+", " ")
-                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
-                .replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\p{Punct})+", " ")
-                .replaceAll("(\\d)+", " ")
+			.replaceAll("(\\d)+", " ")
-                .replaceAll("(\\n)+", " ")
+			.replaceAll("(\\n)+", " ")
-                .trim();
+			.trim();
-    }
+	}
 	private static String authorFieldToBeCompared(Author author) {
 		if (StringUtils.isNotBlank(author.getSurname())) {
 			return author.getSurname();
-    private static String authorFieldToBeCompared(Author author) {
+		}
-        if (StringUtils.isNotBlank(author.getSurname())) {
+		if (StringUtils.isNotBlank(author.getFullname())) {
-            return author.getSurname();
+			return author.getFullname();
 		}
 		return null;
 	}
-        }
+	/**
-        if (StringUtils.isNotBlank(author.getFullname())) {
+	 * This method tries to figure out when two author are the same in the contest
-            return author.getFullname();
+	 * of ORCID enrichment
-        }
+	 *
-        return null;
+	 * @param left  Author in the OAF entity
-    }
+	 * @param right Author ORCID
 	 * @return based on a heuristic on the names of the authors if they are the same.
 	 */
 	public static boolean checkORCIDSimilarity(final Author left, final Author right) {
 		final Person pl = parse(left);
 		final Person pr = parse(right);
-    /**
+		// If one of them didn't have a surname we verify if they have the fullName not empty
-     * This method tries to figure out when two author are the same in the contest
+		// and verify if the normalized version is equal
-     * of ORCID enrichment
+		if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
-     *
+			pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
     * @param left  Author in the OAF entity
     * @param right Author ORCID
     * @return based on a heuristic on the names of the authors if they are the same.
     */
    public static boolean checkORCIDSimilarity(final Author left, final Author right) {
        final Person pl = parse(left);
        final Person pr = parse(right);
-        // If one of them didn't have a surname we verify if they have the fullName not empty
+			if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
-        // and verify if the normalized version is equal
+				&& !pr.getFullname().isEmpty()) {
-        if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
+				return pl
-                pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
+					.getFullname()
 					.stream()
 					.anyMatch(
 						fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
 			} else {
 				return false;
 			}
 		}
 		// The Authors have one surname in common
 		if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
-            if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
+			// If one of them has only a surname and is the same we can say that they are the same author
-                    && !pr.getFullname().isEmpty()) {
+			if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
-                return pl
+				(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
-                        .getFullname()
+				return true;
-                        .stream()
+			// The authors have the same initials of Name in common
-                        .anyMatch(
+			if (pl
-                                fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
+				.getName()
-            } else {
+				.stream()
-                return false;
+				.anyMatch(
-            }
+					nl -> pr
-        }
+						.getName()
-        // The Authors have one surname in common
+						.stream()
-        if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
+						.anyMatch(nr -> nr.equalsIgnoreCase(nl))))
 				return true;
 		}
-            // If one of them has only a surname and is the same we can say that they are the same author
+		// Sometimes we noticed that publication have author wrote in inverse order Surname, Name
-            if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
+		// We verify if we have an exact match between name and surname
-                    (pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
+		if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
-                return true;
+			pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
-            // The authors have the same initials of Name in common
+			return true;
-            if (pl
+		else
-                    .getName()
+			return false;
-                    .stream()
+	}
-                    .anyMatch(
+	//
                            nl -> pr
                                    .getName()
                                    .stream()
                                    .anyMatch(nr -> nr.equalsIgnoreCase(nl))))
                return true;
        }
-        // Sometimes we noticed that publication have author wrote in inverse order Surname, Name
+	/**
-        // We verify if we have an exact match between name and surname
+	 * Method to enrich ORCID information in one list of authors based on another list
-        if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
+	 *
-                pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
+	 * @param baseAuthor  the Author List in the OAF Entity
-            return true;
+	 * @param orcidAuthor The list of ORCID Author intersected
-        else
+	 * @return The Author List of the OAF Entity enriched with the orcid Author
-            return false;
+	 */
-    }
+	public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
    //
-    /**
+		if (baseAuthor == null || baseAuthor.isEmpty())
-     * Method to enrich ORCID information in one list of authors based on another list
+			return orcidAuthor;
     *
     * @param baseAuthor  the Author List in the OAF Entity
     * @param orcidAuthor The list of ORCID Author intersected
     * @return The Author List of the OAF Entity enriched with the orcid Author
     */
    public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
-        if (baseAuthor == null || baseAuthor.isEmpty())
+		if (orcidAuthor == null || orcidAuthor.isEmpty())
-            return orcidAuthor;
+			return baseAuthor;
-        if (orcidAuthor == null || orcidAuthor.isEmpty())
+		if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
-            return baseAuthor;
+			return baseAuthor;
-        if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
+		final List<Author> oAuthor = new ArrayList<>();
-            return baseAuthor;
+		oAuthor.addAll(orcidAuthor);
-        final List<Author> oAuthor = new ArrayList<>();
+		baseAuthor.forEach(ba -> {
-        oAuthor.addAll(orcidAuthor);
+			Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
 			if (aMatch.isPresent()) {
 				final Author sameAuthor = aMatch.get();
 				addPid(ba, sameAuthor.getPid());
 				oAuthor.remove(sameAuthor);
 			}
 		});
 		return baseAuthor;
 	}
-        baseAuthor.forEach(ba -> {
+	private static void addPid(final Author a, final List<StructuredProperty> pids) {
            Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
            if (aMatch.isPresent()) {
                final Author sameAuthor = aMatch.get();
                addPid(ba, sameAuthor.getPid());
                oAuthor.remove(sameAuthor);
            }
        });
        return baseAuthor;
    }
-    private static void addPid(final Author a, final List<StructuredProperty> pids) {
+		if (a.getPid() == null) {
 			a.setPid(new ArrayList<>());
 		}
-        if (a.getPid() == null) {
+		a.getPid().addAll(pids);
            a.setPid(new ArrayList<>());
        }
-        a.getPid().addAll(pids);
+	}
-    }
+	public static String pidToComparableString(StructuredProperty pid) {
 		final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
 			: "";
 		return (pid.getQualifier() != null ? classid : "")
 			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
 	}
-    public static String pidToComparableString(StructuredProperty pid) {
+	public static int countAuthorsPids(List<Author> authors) {
-        final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
+		if (authors == null)
-                : "";
+			return 0;
        return (pid.getQualifier() != null ? classid : "")
                + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
    }
-    public static int countAuthorsPids(List<Author> authors) {
+		return (int) authors.stream().filter(AuthorMerger::hasPid).count();
-        if (authors == null)
+	}
            return 0;
-        return (int) authors.stream().filter(AuthorMerger::hasPid).count();
+	private static int authorsSize(List<Author> authors) {
-    }
+		if (authors == null)
 			return 0;
 		return authors.size();
 	}
-    private static int authorsSize(List<Author> authors) {
+	private static Double sim(Author a, Author b) {
        if (authors == null)
            return 0;
        return authors.size();
    }
-    private static Double sim(Author a, Author b) {
+		final Person pa = parse(a);
 		final Person pb = parse(b);
-        final Person pa = parse(a);
+		// if both are accurate (e.g. they have name and surname)
-        final Person pb = parse(b);
+		if (pa.isAccurate() & pb.isAccurate()) {
 			return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
 				+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
 		} else {
 			return new JaroWinkler()
 				.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
 		}
 	}
-        // if both are accurate (e.g. they have name and surname)
+	private static boolean hasPid(Author a) {
-        if (pa.isAccurate() & pb.isAccurate()) {
+		if (a == null || a.getPid() == null || a.getPid().isEmpty())
-            return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
+			return false;
-                    + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
+		return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
-        } else {
+	}
            return new JaroWinkler()
                    .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
        }
    }
-    private static boolean hasPid(Author a) {
+	private static Person parse(Author author) {
-        if (a == null || a.getPid() == null || a.getPid().isEmpty())
+		if (StringUtils.isNotBlank(author.getSurname())) {
-            return false;
+			return new Person(author.getSurname() + ", " + author.getName(), false);
-        return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
+		} else {
-    }
+			if (StringUtils.isNotBlank(author.getFullname()))
 				return new Person(author.getFullname(), false);
 			else
 				return new Person("", false);
 		}
 	}
-    private static Person parse(Author author) {
+	public static String normalize(final String s) {
-        if (StringUtils.isNotBlank(author.getSurname())) {
+		String[] normalized = nfd(s)
-            return new Person(author.getSurname() + ", " + author.getName(), false);
+			.toLowerCase()
-        } else {
+			// do not compact the regexes in a single expression, would cause StackOverflowError
-            if (StringUtils.isNotBlank(author.getFullname()))
+			// in case
-                return new Person(author.getFullname(), false);
+			// of large input strings
-            else
+			.replaceAll("(\\W)+", " ")
-                return new Person("", false);
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
-        }
+			.replaceAll("(\\p{Punct})+", " ")
-    }
+			.replaceAll("(\\d)+", " ")
 			.replaceAll("(\\n)+", " ")
 			.trim()
 			.split(" ");
-    public static String normalize(final String s) {
+		Arrays.sort(normalized);
        String[] normalized = nfd(s)
                .toLowerCase()
                // do not compact the regexes in a single expression, would cause StackOverflowError
                // in case
                // of large input strings
                .replaceAll("(\\W)+", " ")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim()
                .split(" ");
-        Arrays.sort(normalized);
+		return String.join(" ", normalized);
 	}
-        return String.join(" ", normalized);
+	private static String nfd(final String s) {
-    }
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
-
+	}
    private static String nfd(final String s) {
        return Normalizer.normalize(s, Normalizer.Form.NFD);
    }
 }