From f64f5d9e231a48182edea10d99c956f4df410935 Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Mon, 5 Jul 2021 16:24:47 +0200
Subject: [PATCH] first implementation and test class for the specific Author
 Merger for doiboost. First change: crossref as base to be enriched. Modified
 the normalization function to remove accents from words

---
 .../doiboost/DoiBoostAuthorMerger.java        | 175 ++++++++++++++++++
 .../doiboost/DoiBoostAuthorMergerTest.java    | 120 ++++++++++++
 2 files changed, 295 insertions(+)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostAuthorMergerTest.java

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
new file mode 100644
index 000000000..741df13ff
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
@@ -0,0 +1,175 @@
+
+package eu.dnetlib.doiboost;
+
+import java.text.Normalizer;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.wcohen.ss.JaroWinkler;
+
+import eu.dnetlib.dhp.oa.merge.AuthorMerger;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.model.Person;
+import scala.Tuple2;
+
+public class DoiBoostAuthorMerger {
+
+	private static final Double THRESHOLD = 0.95;
+
+	public static List<Author> merge(List<List<Author>> authors) {
+
+		Iterator<List<Author>> it = authors.iterator();
+		final List<Author> author = it.next();
+
+		it.forEachRemaining(autList -> enrichPidFromList(author, autList, THRESHOLD));
+
+		return author;
+
+	}
+
+	public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor,
+		Double threshold) {
+
+		enrichPidFromList(crossrefAuthor, otherAuthor, threshold);
+		return crossrefAuthor;
+	}
+
+	public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor) {
+		return mergeAuthor(crossrefAuthor, otherAuthor, THRESHOLD);
+	}
+
+	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
+		if (base == null || enrich == null)
+			return;
+
+		// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
+		final Map<String, Author> basePidAuthorMap = base
+			.stream()
+			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
+			.flatMap(
+				a -> a
+					.getPid()
+					.stream()
+					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
+			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
+
+		// <pid, Author> (list of pid that are missing in the other list)
+		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+			.stream()
+			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
+			.flatMap(
+				a -> a
+					.getPid()
+					.stream()
+					.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
+					.map(p -> new Tuple2<>(p, a)))
+			.collect(Collectors.toList());
+
+		pidToEnrich
+			.forEach(
+				a -> {
+					Optional<Tuple2<Double, Author>> simAuthor = base
+						.stream()
+						.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
+						.max(Comparator.comparing(Tuple2::_1));
+
+					if (simAuthor.isPresent()) {
+						double th = threshold;
+						// increase the threshold if the surname is too short
+						if (simAuthor.get()._2().getSurname() != null
+							&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
+							th = 0.99;
+
+						if (simAuthor.get()._1() > th) {
+							Author r = simAuthor.get()._2();
+							if (r.getPid() == null) {
+								r.setPid(new ArrayList<>());
+							}
+
+							// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
+							// it creates of fixed size, and the add method raise UnsupportedOperationException at
+							// java.util.AbstractList.add
+							final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
+							tmp.add(a._1());
+							r.setPid(tmp);
+						}
+					}
+				});
+	}
+
+	public static String pidToComparableString(StructuredProperty pid) {
+		return (pid.getQualifier() != null
+			? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
+			: "")
+			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
+	}
+
+	public static int countAuthorsPids(List<Author> authors) {
+		if (authors == null)
+			return 0;
+
+		return (int) authors.stream().filter(DoiBoostAuthorMerger::hasPid).count();
+	}
+
+	private static int authorsSize(List<Author> authors) {
+		if (authors == null)
+			return 0;
+		return authors.size();
+	}
+
+	private static Double sim(Author a, Author b) {
+
+		final Person pa = parse(a);
+		final Person pb = parse(b);
+
+		// if both are accurate (e.g. they have name and surname)
+		if (pa.isAccurate() && pb.isAccurate()) {
+			return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
+				+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
+		} else {
+			return new JaroWinkler()
+				.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
+		}
+	}
+
+	private static boolean hasPid(Author a) {
+		if (a == null || a.getPid() == null || a.getPid().size() == 0)
+			return false;
+		return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
+	}
+
+	private static Person parse(Author author) {
+		if (StringUtils.isNotBlank(author.getSurname())) {
+			return new Person(author.getSurname() + ", " + author.getName(), false);
+		} else {
+			return new Person(author.getFullname(), false);
+		}
+	}
+
+	private static String normalize(final String s) {
+		String[] normalized = nfd(s)
+			.replaceAll("[^\\p{ASCII}]", "")
+			.toLowerCase()
+			// do not compact the regexes in a single expression, would cause StackOverflowError
+			// in case
+			// of large input strings
+			.replaceAll("(\\W)+", " ")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+			.replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\d)+", " ")
+			.replaceAll("(\\n)+", " ")
+			.trim()
+			.split(" ");
+
+		Arrays.sort(normalized);
+
+		return String.join(" ", normalized);
+	}
+
+	private static String nfd(final String s) {
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostAuthorMergerTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostAuthorMergerTest.java
new file mode 100644
index 000000000..4779a3b3a
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostAuthorMergerTest.java
@@ -0,0 +1,120 @@
+
+package eu.dnetlib.dhp.doiboost;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.doiboost.DoiBoostAuthorMerger;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import scala.Tuple2;
+
+public class DoiBoostAuthorMergerTest {
+
+	private String publicationsBasePath;
+
+	private List<List<Author>> authors;
+
+	@BeforeEach
+	public void setUp() throws Exception {
+
+		publicationsBasePath = Paths
+			.get(DoiBoostAuthorMergerTest.class.getResource("/eu/dnetlib/dhp/doiboost").toURI())
+			.toFile()
+			.getAbsolutePath();
+
+		authors = readSample(publicationsBasePath + "/matching_authors_first.json", Publication.class)
+			.stream()
+			.map(p -> p._2().getAuthor())
+			.collect(Collectors.toList());
+
+	}
+
+	@Test
+	public void mergeTest() { // used in the dedup: threshold set to 0.95
+
+		for (List<Author> authors1 : authors) {
+			System.out.println("List " + (authors.indexOf(authors1) + 1));
+			for (Author author : authors1) {
+				System.out.println(authorToString(author));
+			}
+		}
+
+		List<Author> merge = DoiBoostAuthorMerger.merge(authors);
+
+		System.out.println("Merge ");
+		for (Author author : merge) {
+			System.out.println(authorToString(author));
+		}
+
+		Assertions.assertEquals(10, merge.size());
+
+		Assertions.assertEquals(3, merge.stream().filter(a -> a.getPid() != null).count());
+
+		merge
+			.stream()
+			.filter(a -> a.getPid() != null)
+			.forEach(
+				a -> Assertions
+					.assertTrue(
+						a.getPid().stream().anyMatch(p -> p.getQualifier().getClassid().equals(ModelConstants.ORCID))));
+		merge.stream().filter(a -> a.getPid() != null).forEach(a -> {
+			try {
+				System.out.println(new ObjectMapper().writeValueAsString(a));
+			} catch (JsonProcessingException e) {
+				e.printStackTrace();
+			}
+		});
+
+	}
+
+	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
+		List<Tuple2<String, T>> res = new ArrayList<>();
+		BufferedReader reader;
+		try {
+			reader = new BufferedReader(new FileReader(path));
+			String line = reader.readLine();
+			while (line != null) {
+				res
+					.add(
+						new Tuple2<>(
+							MapDocumentUtil.getJPathString("$.id", line),
+							new ObjectMapper().readValue(line, clazz)));
+				// read next line
+				line = reader.readLine();
+			}
+			reader.close();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+
+		return res;
+	}
+
+	public String authorToString(Author a) {
+
+		String print = "Fullname = ";
+		print += a.getFullname() + " pid = [";
+		if (a.getPid() != null)
+			for (StructuredProperty sp : a.getPid()) {
+				print += sp.toComparableString() + " ";
+			}
+		print += "]";
+		return print;
+	}
+}