first implementation and test class for the specific Author Merger for doiboost. First change: crossref as base to be enriched. Modified the normalization function to remove accents from words

2021-07-05 16:24:47 +02:00 · 2021-07-05 16:24:47 +02:00 · f64f5d9e23
parent 238d692a0a
commit f64f5d9e23
2 changed files with 295 additions and 0 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
@ -0,0 +1,175 @@
+
+package eu.dnetlib.doiboost;
+
+import java.text.Normalizer;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.wcohen.ss.JaroWinkler;
+
+import eu.dnetlib.dhp.oa.merge.AuthorMerger;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.model.Person;
+import scala.Tuple2;
+
+public class DoiBoostAuthorMerger {
+
+	private static final Double THRESHOLD = 0.95;
+
+	public static List<Author> merge(List<List<Author>> authors) {
+
+		Iterator<List<Author>> it = authors.iterator();
+		final List<Author> author = it.next();
+
+		it.forEachRemaining(autList -> enrichPidFromList(author, autList, THRESHOLD));
+
+		return author;
+
+	}
+
+	public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor,
+		Double threshold) {
+
+		enrichPidFromList(crossrefAuthor, otherAuthor, threshold);
+		return crossrefAuthor;
+	}
+
+	public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor) {
+		return mergeAuthor(crossrefAuthor, otherAuthor, THRESHOLD);
+	}
+
+	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
+		if (base == null || enrich == null)
+			return;
+
+		// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
+		final Map<String, Author> basePidAuthorMap = base
+			.stream()
+			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
+			.flatMap(
+				a -> a
+					.getPid()
+					.stream()
+					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
+			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
+
+		// <pid, Author> (list of pid that are missing in the other list)
+		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+			.stream()
+			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
+			.flatMap(
+				a -> a
+					.getPid()
+					.stream()
+					.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
+					.map(p -> new Tuple2<>(p, a)))
+			.collect(Collectors.toList());
+
+		pidToEnrich
+			.forEach(
+				a -> {
+					Optional<Tuple2<Double, Author>> simAuthor = base
+						.stream()
+						.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
+						.max(Comparator.comparing(Tuple2::_1));
+
+					if (simAuthor.isPresent()) {
+						double th = threshold;
+						// increase the threshold if the surname is too short
+						if (simAuthor.get()._2().getSurname() != null
+							&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
+							th = 0.99;
+
+						if (simAuthor.get()._1() > th) {
+							Author r = simAuthor.get()._2();
+							if (r.getPid() == null) {
+								r.setPid(new ArrayList<>());
+							}
+
+							// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
+							// it creates of fixed size, and the add method raise UnsupportedOperationException at
+							// java.util.AbstractList.add
+							final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
+							tmp.add(a._1());
+							r.setPid(tmp);
+						}
+					}
+				});
+	}
+
+	public static String pidToComparableString(StructuredProperty pid) {
+		return (pid.getQualifier() != null
+			? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
+			: "")
+			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
+	}
+
+	public static int countAuthorsPids(List<Author> authors) {
+		if (authors == null)
+			return 0;
+
+		return (int) authors.stream().filter(DoiBoostAuthorMerger::hasPid).count();
+	}
+
+	private static int authorsSize(List<Author> authors) {
+		if (authors == null)
+			return 0;
+		return authors.size();
+	}
+
+	private static Double sim(Author a, Author b) {
+
+		final Person pa = parse(a);
+		final Person pb = parse(b);
+
+		// if both are accurate (e.g. they have name and surname)
+		if (pa.isAccurate() & pb.isAccurate()) {
+			return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
+				+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
+		} else {
+			return new JaroWinkler()
+				.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
+		}
+	}
+
+	private static boolean hasPid(Author a) {
+		if (a == null || a.getPid() == null || a.getPid().size() == 0)
+			return false;
+		return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
+	}
+
+	private static Person parse(Author author) {
+		if (StringUtils.isNotBlank(author.getSurname())) {
+			return new Person(author.getSurname() + ", " + author.getName(), false);
+		} else {
+			return new Person(author.getFullname(), false);
+		}
+	}
+
+	private static String normalize(final String s) {
+		String[] normalized = nfd(s)
+			.replaceAll("[^\\p{ASCII}]", "")
+			.toLowerCase()
+			// do not compact the regexes in a single expression, would cause StackOverflowError
+			// in case
+			// of large input strings
+			.replaceAll("(\\W)+", " ")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+			.replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\d)+", " ")
+			.replaceAll("(\\n)+", " ")
+			.trim()
+			.split(" ");
+
+		Arrays.sort(normalized);
+
+		return String.join(" ", normalized);
+	}
+
+	private static String nfd(final String s) {
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostAuthorMergerTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostAuthorMergerTest.java
@ -0,0 +1,120 @@
+
+package eu.dnetlib.dhp.doiboost;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.doiboost.DoiBoostAuthorMerger;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import scala.Tuple2;
+
+public class DoiBoostAuthorMergerTest {
+
+	private String publicationsBasePath;
+
+	private List<List<Author>> authors;
+
+	@BeforeEach
+	public void setUp() throws Exception {
+
+		publicationsBasePath = Paths
+			.get(DoiBoostAuthorMergerTest.class.getResource("/eu/dnetlib/dhp/doiboost").toURI())
+			.toFile()
+			.getAbsolutePath();
+
+		authors = readSample(publicationsBasePath + "/matching_authors_first.json", Publication.class)
+			.stream()
+			.map(p -> p._2().getAuthor())
+			.collect(Collectors.toList());
+
+	}
+
+	@Test
+	public void mergeTest() { // used in the dedup: threshold set to 0.95
+
+		for (List<Author> authors1 : authors) {
+			System.out.println("List " + (authors.indexOf(authors1) + 1));
+			for (Author author : authors1) {
+				System.out.println(authorToString(author));
+			}
+		}
+
+		List<Author> merge = DoiBoostAuthorMerger.merge(authors);
+
+		System.out.println("Merge ");
+		for (Author author : merge) {
+			System.out.println(authorToString(author));
+		}
+
+		Assertions.assertEquals(10, merge.size());
+
+		Assertions.assertEquals(3, merge.stream().filter(a -> a.getPid() != null).count());
+
+		merge
+			.stream()
+			.filter(a -> a.getPid() != null)
+			.forEach(
+				a -> Assertions
+					.assertTrue(
+						a.getPid().stream().anyMatch(p -> p.getQualifier().getClassid().equals(ModelConstants.ORCID))));
+		merge.stream().filter(a -> a.getPid() != null).forEach(a -> {
+			try {
+				System.out.println(new ObjectMapper().writeValueAsString(a));
+			} catch (JsonProcessingException e) {
+				e.printStackTrace();
+			}
+		});
+
+	}
+
+	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
+		List<Tuple2<String, T>> res = new ArrayList<>();
+		BufferedReader reader;
+		try {
+			reader = new BufferedReader(new FileReader(path));
+			String line = reader.readLine();
+			while (line != null) {
+				res
+					.add(
+						new Tuple2<>(
+							MapDocumentUtil.getJPathString("$.id", line),
+							new ObjectMapper().readValue(line, clazz)));
+				// read next line
+				line = reader.readLine();
+			}
+			reader.close();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+
+		return res;
+	}
+
+	public String authorToString(Author a) {
+
+		String print = "Fullname = ";
+		print += a.getFullname() + " pid = [";
+		if (a.getPid() != null)
+			for (StructuredProperty sp : a.getPid()) {
+				print += sp.toComparableString() + " ";
+			}
+		print += "]";
+		return print;
+	}
+}