first implementation and test class for the specific Author Merger for doiboost. First change: crossref as base to be enriched. Modified the normalization function to remove accents from words

2021-07-05 16:24:47 +02:00 · 2021-07-05 16:24:47 +02:00 · f64f5d9e23
parent 238d692a0a
commit f64f5d9e23
2 changed files with 295 additions and 0 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostAuthorMerger.java
@ -0,0 +1,175 @@
 package eu.dnetlib.doiboost;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import com.wcohen.ss.JaroWinkler;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.pace.model.Person;
 import scala.Tuple2;
 /**
  * Author merger used by DoiBoost: a base author list (the Crossref one, judging by the parameter names of
  * {@link #mergeAuthor(List, List, Double)}) is enriched with the persistent identifiers (pids) found in other
  * author lists.
  * <p>
  * A pid is copied from an author of the enriching list to the most similar author of the base list, provided
  * that the pid is not already present in the base list and that the name similarity exceeds a threshold.
  * The base list is never extended with new authors; only the pid lists of its authors are replaced.
  */
 public class DoiBoostAuthorMerger {
 	/** Default minimum name similarity (exclusive) required to copy a pid onto a base author. */
 	private static final Double THRESHOLD = 0.95;
 	/** Stricter minimum similarity applied when the matched base author has a very short surname. */
 	private static final double SHORT_SURNAME_THRESHOLD = 0.99;
 	/** Surnames up to this length are considered too short to be matched with the regular threshold. */
 	private static final int SHORT_SURNAME_MAX_LENGTH = 3;
 	/**
 	 * Enriches the first author list with the pids found in all the following lists, using the default
 	 * threshold.
 	 *
 	 * @param authors the author lists; the first one is the base and is modified in place
 	 * @return the first list (possibly {@code null} if the first element is {@code null}), or an empty list
 	 *         when {@code authors} is {@code null} or empty
 	 */
 	public static List<Author> merge(List<List<Author>> authors) {
 		// nothing to merge: previously this failed with NoSuchElementException / NullPointerException
 		if (authors == null || authors.isEmpty()) {
 			return new ArrayList<>();
 		}
 		final Iterator<List<Author>> it = authors.iterator();
 		final List<Author> base = it.next();
 		it.forEachRemaining(other -> enrichPidFromList(base, other, THRESHOLD));
 		return base;
 	}
 	/**
 	 * Enriches {@code crossrefAuthor} with the pids of {@code otherAuthor}.
 	 *
 	 * @param crossrefAuthor the base list, modified in place; may be {@code null}
 	 * @param otherAuthor    the list providing the pids; may be {@code null}
 	 * @param threshold      minimum similarity (exclusive); {@code null} selects the default threshold
 	 * @return {@code crossrefAuthor}, the same instance that was passed in
 	 */
 	public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor,
 		Double threshold) {
 		enrichPidFromList(crossrefAuthor, otherAuthor, threshold);
 		return crossrefAuthor;
 	}
 	/**
 	 * Same as {@link #mergeAuthor(List, List, Double)} with the default threshold.
 	 *
 	 * @param crossrefAuthor the base list, modified in place; may be {@code null}
 	 * @param otherAuthor    the list providing the pids; may be {@code null}
 	 * @return {@code crossrefAuthor}, the same instance that was passed in
 	 */
 	public static List<Author> mergeAuthor(final List<Author> crossrefAuthor, final List<Author> otherAuthor) {
 		return mergeAuthor(crossrefAuthor, otherAuthor, THRESHOLD);
 	}
 	/**
 	 * Copies every pid of {@code enrich} that is unknown to {@code base} onto the most similar base author.
 	 * Does nothing when either list is {@code null}. Null authors and null pids are skipped, consistently
 	 * with {@link #hasPid(Author)}.
 	 */
 	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
 		if (base == null || enrich == null) {
 			return;
 		}
 		// a null threshold would otherwise fail while unboxing
 		final double minSimilarity = threshold != null ? threshold : THRESHOLD;
 		// comparable strings of all the pids already attached to some base author
 		final Set<String> knownPids = base
 			.stream()
 			.filter(a -> a != null && a.getPid() != null)
 			.flatMap(a -> a.getPid().stream())
 			.filter(Objects::nonNull)
 			.map(DoiBoostAuthorMerger::pidToComparableString)
 			.collect(Collectors.toCollection(HashSet::new));
 		for (Author candidate : enrich) {
 			if (candidate == null || candidate.getPid() == null) {
 				continue;
 			}
 			// pids of this author that are missing in the base list
 			final List<StructuredProperty> missing = new ArrayList<>();
 			for (StructuredProperty pid : candidate.getPid()) {
 				if (pid != null && !knownPids.contains(pidToComparableString(pid))) {
 					missing.add(pid);
 				}
 			}
 			if (missing.isEmpty()) {
 				continue;
 			}
 			// the best match depends only on the names, so it is computed once per author and not once per pid
 			final Optional<Author> match = findBestMatch(base, candidate, minSimilarity);
 			if (!match.isPresent()) {
 				continue;
 			}
 			for (StructuredProperty pid : missing) {
 				// registering the pid as known prevents attaching it again if it occurs more than once
 				if (knownPids.add(pidToComparableString(pid))) {
 					addPid(match.get(), pid);
 				}
 			}
 		}
 	}
 	/**
 	 * Finds the base author most similar to {@code candidate}.
 	 *
 	 * @return the most similar base author if its similarity is strictly greater than the required threshold,
 	 *         otherwise an empty {@link Optional}; on ties the first author in list order wins
 	 */
 	private static Optional<Author> findBestMatch(List<Author> base, Author candidate, double threshold) {
 		final Person candidatePerson = parse(candidate);
 		final Optional<Tuple2<Double, Author>> best = base
 			.stream()
 			.filter(Objects::nonNull)
 			.map(ba -> new Tuple2<>(sim(parse(ba), candidatePerson), ba))
 			.max(Comparator.comparing(Tuple2::_1));
 		if (!best.isPresent()) {
 			return Optional.empty();
 		}
 		final Author matched = best.get()._2();
 		double required = threshold;
 		// increase the threshold if the surname is too short; Math.max guarantees that a caller supplied
 		// threshold above 0.99 is never lowered. A threshold <= 0 disables the stricter rule.
 		if (matched.getSurname() != null
 			&& matched.getSurname().length() <= SHORT_SURNAME_MAX_LENGTH && threshold > 0.0) {
 			required = Math.max(threshold, SHORT_SURNAME_THRESHOLD);
 		}
 		return best.get()._1() > required ? Optional.of(matched) : Optional.empty();
 	}
 	/**
 	 * Appends {@code pid} to the pids of {@code author}.
 	 * <p>
 	 * The pid list is copied into a fresh {@link ArrayList} instead of being modified in place, because the
 	 * existing list may be a fixed-size one (e.g. created with {@code Arrays.asList}) whose {@code add}
 	 * raises {@link UnsupportedOperationException}.
 	 */
 	private static void addPid(Author author, StructuredProperty pid) {
 		final List<StructuredProperty> pids = new ArrayList<>();
 		if (author.getPid() != null) {
 			pids.addAll(author.getPid());
 		}
 		pids.add(pid);
 		author.setPid(pids);
 	}
 	/**
 	 * Builds the string used to compare two pids: lower-cased qualifier class id followed by the lower-cased
 	 * value. Missing parts contribute an empty string.
 	 *
 	 * @param pid the pid; may be {@code null}
 	 * @return the comparable string, empty for a {@code null} pid
 	 */
 	public static String pidToComparableString(StructuredProperty pid) {
 		if (pid == null) {
 			return "";
 		}
 		final String classid = pid.getQualifier() != null ? pid.getQualifier().getClassid() : null;
 		final String value = pid.getValue();
 		// Locale.ROOT keeps the key independent from the JVM default locale (e.g. Turkish dotless i)
 		return (classid != null ? classid.toLowerCase(Locale.ROOT) : "")
 			+ (value != null ? value.toLowerCase(Locale.ROOT) : "");
 	}
 	/**
 	 * Counts the authors having at least one pid with a non-blank value.
 	 *
 	 * @param authors the authors; may be {@code null}
 	 * @return the number of authors with a pid, 0 for a {@code null} list
 	 */
 	public static int countAuthorsPids(List<Author> authors) {
 		if (authors == null)
 			return 0;
 		return (int) authors.stream().filter(DoiBoostAuthorMerger::hasPid).count();
 	}
 	/** Null-safe size of an author list. */
 	private static int authorsSize(List<Author> authors) {
 		if (authors == null)
 			return 0;
 		return authors.size();
 	}
 	/**
 	 * Name similarity of two persons computed with Jaro-Winkler on the normalized names: the mean of surname
 	 * and name scores when both persons are accurate, the score of the full names otherwise.
 	 */
 	private static Double sim(Person pa, Person pb) {
 		// if both are accurate (e.g. they have name and surname)
 		if (pa.isAccurate() && pb.isAccurate()) {
 			return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
 				+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
 		}
 		return new JaroWinkler()
 			.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
 	}
 	/** Tells whether the author has at least one non-null pid with a non-blank value. */
 	private static boolean hasPid(Author a) {
 		if (a == null || a.getPid() == null || a.getPid().isEmpty())
 			return false;
 		return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
 	}
 	/**
 	 * Builds a {@link Person} from the author: "surname, name" when the surname is available, the full name
 	 * otherwise.
 	 */
 	private static Person parse(Author author) {
 		if (StringUtils.isNotBlank(author.getSurname())) {
 			// a missing given name must not end up as the literal text "null" in the compared string
 			final String name = author.getName() != null ? author.getName() : "";
 			return new Person(author.getSurname() + ", " + name, false);
 		}
 		// NOTE(review): the full name is passed as is, even when null; confirm how Person handles it
 		return new Person(author.getFullname(), false);
 	}
 	/**
 	 * Normalizes a name for comparison: removes accents and every non-ASCII character, lower-cases, replaces
 	 * punctuation and digits with blanks and returns the remaining tokens sorted and joined by a single
 	 * blank, so that the word order does not affect the comparison.
 	 *
 	 * @param s the string to normalize; may be {@code null}
 	 * @return the normalized string, empty for a {@code null} input
 	 */
 	private static String normalize(final String s) {
 		if (s == null) {
 			return "";
 		}
 		final String[] tokens = nfd(s)
 			.replaceAll("[^\\p{ASCII}]", "")
 			.toLowerCase(Locale.ROOT)
 			// do not compact the regexes in a single expression, would cause StackOverflowError
 			// in case
 			// of large input strings
 			.replaceAll("(\\W)+", " ")
 			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
 			.replaceAll("(\\p{Punct})+", " ")
 			.replaceAll("(\\d)+", " ")
 			.replaceAll("(\\n)+", " ")
 			.trim()
 			// the replacements above may leave runs of blanks: splitting on a single blank would produce
 			// empty tokens that, once sorted, turn into leading blanks of the result
 			.split("\\s+");
 		Arrays.sort(tokens);
 		return String.join(" ", tokens);
 	}
 	/** Canonical decomposition (NFD), which separates base letters from their accents. */
 	private static String nfd(final String s) {
 		return Normalizer.normalize(s, Normalizer.Form.NFD);
 	}
 }
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostAuthorMergerTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostAuthorMergerTest.java
@ -0,0 +1,120 @@
 package eu.dnetlib.dhp.doiboost;
 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.doiboost.DoiBoostAuthorMerger;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import scala.Tuple2;
 /**
  * Tests for {@link DoiBoostAuthorMerger}, based on sample publications stored one JSON document per line in
  * the test resources.
  */
 public class DoiBoostAuthorMergerTest {
 	/** Shared mapper: creating a new one for every parsed line is needlessly expensive. */
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private String publicationsBasePath;
 	/** Author lists of the sample publications, one list per publication, in file order. */
 	private List<List<Author>> authors;
 	/** Resolves the resource folder and loads the author lists of the sample publications. */
 	@BeforeEach
 	public void setUp() throws Exception {
 		publicationsBasePath = Paths
 			.get(DoiBoostAuthorMergerTest.class.getResource("/eu/dnetlib/dhp/doiboost").toURI())
 			.toFile()
 			.getAbsolutePath();
 		authors = readSample(publicationsBasePath + "/matching_authors_first.json", Publication.class)
 			.stream()
 			.map(p -> p._2().getAuthor())
 			.collect(Collectors.toList());
 	}
 	/**
 	 * Merges all the sample author lists and checks the size of the result, the number of authors carrying
 	 * pids and that each of them has an ORCID pid.
 	 */
 	@Test
 	public void mergeTest() { // used in the dedup: threshold set to 0.95
 		// explicit counter: indexOf would report the first equal list, i.e. a wrong number for equal lists
 		int listNumber = 1;
 		for (List<Author> authorList : authors) {
 			System.out.println("List " + listNumber++);
 			for (Author author : authorList) {
 				System.out.println(authorToString(author));
 			}
 		}
 		List<Author> merge = DoiBoostAuthorMerger.merge(authors);
 		System.out.println("Merge ");
 		for (Author author : merge) {
 			System.out.println(authorToString(author));
 		}
 		Assertions.assertEquals(10, merge.size());
 		Assertions.assertEquals(3, merge.stream().filter(a -> a.getPid() != null).count());
 		merge
 			.stream()
 			.filter(a -> a.getPid() != null)
 			.forEach(
 				a -> Assertions
 					.assertTrue(
 						a.getPid().stream().anyMatch(p -> p.getQualifier().getClassid().equals(ModelConstants.ORCID))));
 		// diagnostic output only: a serialization problem is reported but does not fail the test
 		merge.stream().filter(a -> a.getPid() != null).forEach(a -> {
 			try {
 				System.out.println(OBJECT_MAPPER.writeValueAsString(a));
 			} catch (JsonProcessingException e) {
 				e.printStackTrace();
 			}
 		});
 	}
 	/**
 	 * Reads a file containing one JSON document per line.
 	 *
 	 * @param path  path of the file to read, decoded as UTF-8
 	 * @param clazz type each line is deserialized to
 	 * @param <T>   type of the deserialized documents
 	 * @return one pair (value of {@code $.id}, deserialized document) per line, in file order
 	 * @throws UncheckedIOException if the file cannot be read or a line cannot be parsed; failing here
 	 *                              avoids running the tests on a silently empty or truncated sample
 	 */
 	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
 		final List<Tuple2<String, T>> res = new ArrayList<>();
 		// try-with-resources closes the reader also when a line fails to parse
 		try (BufferedReader reader = Files.newBufferedReader(Paths.get(path), StandardCharsets.UTF_8)) {
 			String line;
 			while ((line = reader.readLine()) != null) {
 				res
 					.add(
 						new Tuple2<>(
 							MapDocumentUtil.getJPathString("$.id", line),
 							OBJECT_MAPPER.readValue(line, clazz)));
 			}
 		} catch (IOException e) {
 			throw new UncheckedIOException("Unable to read sample file " + path, e);
 		}
 		return res;
 	}
 	/**
 	 * Renders an author as {@code Fullname = <fullname> pid = [<pid> <pid> ]}, using the comparable string
 	 * of each pid.
 	 */
 	public String authorToString(Author a) {
 		final StringBuilder print = new StringBuilder("Fullname = ");
 		print.append(a.getFullname()).append(" pid = [");
 		if (a.getPid() != null) {
 			for (StructuredProperty sp : a.getPid()) {
 				print.append(sp.toComparableString()).append(" ");
 			}
 		}
 		print.append("]");
 		return print.toString();
 	}
 }