Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop

2020-05-21 16:48:20 +02:00 · 2020-05-21 16:48:20 +02:00 · c7ca3cf35b
parent 3e34517479 8b35e0e7f0
commit c7ca3cf35b
6 changed files with 313 additions and 179 deletions
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java
@ -0,0 +1,159 @@
+package eu.dnetlib.dhp.oa.dedup;
+
+import com.wcohen.ss.JaroWinkler;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.model.Person;
+import org.apache.commons.lang3.StringUtils;
+import scala.Tuple2;
+
+import java.text.Normalizer;
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class AuthorMerger {
+
+    private static final Double THRESHOLD = 0.95;
+
+    public static List<Author> merge(List<List<Author>> authors){
+
+        authors.sort(new Comparator<List<Author>>() {
+            @Override
+            public int compare(List<Author> o1, List<Author> o2) {
+                return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2));
+            }
+        });
+
+        List<Author> author = new ArrayList<>();
+
+        for(List<Author> a : authors){
+            author = mergeAuthor(author, a);
+        }
+
+        return author;
+
+    }
+
+    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+        int pa = countAuthorsPids(a);
+        int pb = countAuthorsPids(b);
+        List<Author> base, enrich;
+        int sa = authorsSize(a);
+        int sb = authorsSize(b);
+
+        if (pa == pb) {
+            base = sa > sb ? a : b;
+            enrich = sa > sb ? b : a;
+        } else {
+            base = pa > pb ? a : b;
+            enrich = pa > pb ? b : a;
+        }
+        enrichPidFromList(base, enrich);
+        return base;
+    }
+
+    private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+        if (base == null || enrich == null)
+            return;
+        final Map<String, Author> basePidAuthorMap = base
+                .stream()
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .flatMap(
+                        a -> a
+                                .getPid()
+                                .stream()
+                                .map(p -> new Tuple2<>(pidToComparableString(p), a)))
+                .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
+
+        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+                .stream()
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .flatMap(
+                        a -> a
+                                .getPid()
+                                .stream()
+                                .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
+                                .map(p -> new Tuple2<>(p, a)))
+                .collect(Collectors.toList());
+
+        pidToEnrich
+                .forEach(
+                        a -> {
+                            Optional<Tuple2<Double, Author>> simAuthor = base
+                                    .stream()
+                                    .map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
+                                    .max(Comparator.comparing(Tuple2::_1));
+                            if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
+                                Author r = simAuthor.get()._2();
+                                if (r.getPid() == null) {
+                                    r.setPid(new ArrayList<>());
+                                }
+                                r.getPid().add(a._1());
+                            }
+                        });
+    }
+
+    public static String pidToComparableString(StructuredProperty pid){
+        return (pid.getQualifier()!=null? pid.getQualifier().getClassid()!=null?pid.getQualifier().getClassid().toLowerCase():"":"") + (pid.getValue()!=null? pid.getValue().toLowerCase():"");
+    }
+
+    public static int countAuthorsPids(List<Author> authors) {
+        if (authors == null)
+            return 0;
+
+        return (int) authors.stream().filter(AuthorMerger::hasPid).count();
+    }
+
+    private static int authorsSize(List<Author> authors) {
+        if (authors == null)
+            return 0;
+        return authors.size();
+    }
+
+    private static Double sim(Author a, Author b) {
+
+        final Person pa = parse(a);
+        final Person pb = parse(b);
+
+        if (pa.isAccurate() & pb.isAccurate()) {
+            return new JaroWinkler()
+                    .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
+        } else {
+            return new JaroWinkler()
+                    .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
+        }
+    }
+
+    private static boolean hasPid(Author a) {
+        if (a == null || a.getPid() == null || a.getPid().size() == 0)
+            return false;
+        return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
+    }
+
+    private static Person parse(Author author) {
+        if (StringUtils.isNotBlank(author.getSurname())) {
+            return new Person(author.getSurname() + ", " + author.getName(), false);
+        } else {
+            return new Person(author.getFullname(), false);
+        }
+    }
+
+    private static String normalize(final String s) {
+        return nfd(s)
+                .toLowerCase()
+                // do not compact the regexes in a single expression, would cause StackOverflowError
+                // in case
+                // of large input strings
+                .replaceAll("(\\W)+", " ")
+                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+                .replaceAll("(\\p{Punct})+", " ")
+                .replaceAll("(\\d)+", " ")
+                .replaceAll("(\\n)+", " ")
+                .trim();
+    }
+
+    private static String nfd(final String s) {
+        return Normalizer.normalize(s, Normalizer.Form.NFD);
+    }
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -1,8 +1,9 @@
-
 package eu.dnetlib.dhp.oa.dedup;

+import java.io.Serializable;
 import java.util.Collection;
 import java.util.Iterator;
+import java.util.List;

 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
@ -67,16 +68,18 @@ public class DedupRecordFactory {
 				(MapFunction<Tuple2<String, T>, String>) entity -> entity._1(), Encoders.STRING())
 			.mapGroups(
 				(MapGroupsFunction<String, Tuple2<String, T>, T>) (key,
-					values) -> entityMerger(key, values, ts, dataInfo),
+					values) -> entityMerger(key, values, ts, dataInfo, clazz),
 				Encoders.bean(clazz));
 	}

-	private static <T extends OafEntity> T entityMerger(
-		String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo) {
+	public static <T extends OafEntity> T entityMerger(
+		String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz) throws IllegalAccessException, InstantiationException {

-		T entity = entities.next()._2();
+		T entity = clazz.newInstance();

 		final Collection<String> dates = Lists.newArrayList();
+		final List<List<Author>> authors = Lists.newArrayList();
+
 		entities
 			.forEachRemaining(
 				t -> {
@ -84,17 +87,17 @@ public class DedupRecordFactory {
 					entity.mergeFrom(duplicate);
 					if (ModelSupport.isSubClass(duplicate, Result.class)) {
 						Result r1 = (Result) duplicate;
-						Result er = (Result) entity;
-						er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor()));
-
-						if (r1.getDateofacceptance() != null) {
+						if (r1.getAuthor() != null && r1.getAuthor().size()>0)
+							authors.add(r1.getAuthor());
+						if (r1.getDateofacceptance() != null)
 							dates.add(r1.getDateofacceptance().getValue());
-						}
 					}
 				});

+		//set authors and date
 		if (ModelSupport.isSubClass(entity, Result.class)) {
 			((Result) entity).setDateofacceptance(DatePicker.pick(dates));
+			((Result) entity).setAuthor(AuthorMerger.merge(authors));
 		}

 		entity.setId(id);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java
@ -32,7 +32,6 @@ import eu.dnetlib.pace.model.Person;
 import scala.Tuple2;

 public class DedupUtility {
-	private static final Double THRESHOLD = 0.95;

 	public static Map<String, LongAccumulator> constructAccumulator(
 		final DedupConfig dedupConf, final SparkContext context) {
@ -82,61 +81,6 @@ public class DedupUtility {
 		}
 	}

-	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
-		int pa = countAuthorsPids(a);
-		int pb = countAuthorsPids(b);
-		List<Author> base, enrich;
-		int sa = authorsSize(a);
-		int sb = authorsSize(b);
-
-		if (pa == pb) {
-			base = sa > sb ? a : b;
-			enrich = sa > sb ? b : a;
-		} else {
-			base = pa > pb ? a : b;
-			enrich = pa > pb ? b : a;
-		}
-		enrichPidFromList(base, enrich);
-		return base;
-	}
-
-	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
-		if (base == null || enrich == null)
-			return;
-		final Map<String, Author> basePidAuthorMap = base
-			.stream()
-			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
-			.flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a)))
-			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
-
-		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
-			.stream()
-			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
-			.flatMap(
-				a -> a
-					.getPid()
-					.stream()
-					.filter(p -> !basePidAuthorMap.containsKey(p.toComparableString()))
-					.map(p -> new Tuple2<>(p, a)))
-			.collect(Collectors.toList());
-
-		pidToEnrich
-			.forEach(
-				a -> {
-					Optional<Tuple2<Double, Author>> simAuhtor = base
-						.stream()
-						.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
-						.max(Comparator.comparing(Tuple2::_1));
-					if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) {
-						Author r = simAuhtor.get()._2();
-						if (r.getPid() == null) {
-							r.setPid(new ArrayList<>());
-						}
-						r.getPid().add(a._1());
-					}
-				});
-	}
-
 	public static String createDedupRecordPath(
 		final String basePath, final String actionSetId, final String entityType) {
 		return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
@ -156,65 +100,6 @@ public class DedupUtility {
 		return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType);
 	}

-	private static Double sim(Author a, Author b) {
-
-		final Person pa = parse(a);
-		final Person pb = parse(b);
-
-		if (pa.isAccurate() & pb.isAccurate()) {
-			return new JaroWinkler()
-				.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
-		} else {
-			return new JaroWinkler()
-				.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
-		}
-	}
-
-	private static String normalize(final String s) {
-		return nfd(s)
-			.toLowerCase()
-			// do not compact the regexes in a single expression, would cause StackOverflowError
-			// in case
-			// of large input strings
-			.replaceAll("(\\W)+", " ")
-			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
-			.replaceAll("(\\p{Punct})+", " ")
-			.replaceAll("(\\d)+", " ")
-			.replaceAll("(\\n)+", " ")
-			.trim();
-	}
-
-	private static String nfd(final String s) {
-		return Normalizer.normalize(s, Normalizer.Form.NFD);
-	}
-
-	private static Person parse(Author author) {
-		if (StringUtils.isNotBlank(author.getSurname())) {
-			return new Person(author.getSurname() + ", " + author.getName(), false);
-		} else {
-			return new Person(author.getFullname(), false);
-		}
-	}
-
-	private static int countAuthorsPids(List<Author> authors) {
-		if (authors == null)
-			return 0;
-
-		return (int) authors.stream().filter(DedupUtility::hasPid).count();
-	}
-
-	private static int authorsSize(List<Author> authors) {
-		if (authors == null)
-			return 0;
-		return authors.size();
-	}
-
-	private static boolean hasPid(Author a) {
-		if (a == null || a.getPid() == null || a.getPid().size() == 0)
-			return false;
-		return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
-	}
-
 	public static List<DedupConfig> getConfigurations(String isLookUpUrl, String orchestrator)
 		throws ISLookUpException, DocumentException {
 		final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl);
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -0,0 +1,138 @@
+package eu.dnetlib.dhp.oa.dedup;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.file.Paths;
+import java.util.*;
+
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.junit.jupiter.api.BeforeEach;
+
+import org.junit.jupiter.api.Test;
+import scala.Tuple2;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class EntityMergerTest implements Serializable {
+
+	List<Tuple2<String, Publication>> publications;
+
+	String testEntityBasePath;
+	DataInfo dataInfo;
+	String dedupId = "dedup_id";
+	Publication pub_top;
+
+	@BeforeEach
+	public void setUp() throws Exception {
+
+		testEntityBasePath = Paths
+				.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
+				.toFile()
+				.getAbsolutePath();
+
+		publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
+
+		pub_top = getTopPub(publications);
+
+		dataInfo = setDI();
+
+	}
+
+	@Test
+	public void publicationMergerTest() throws InstantiationException, IllegalAccessException {
+
+		Publication pub_merged = DedupRecordFactory.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
+
+		assertEquals(dedupId, pub_merged.getId());
+
+		assertEquals(pub_merged.getJournal(), pub_top.getJournal());
+		assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright());
+		assertEquals(pub_merged.getResulttype(), pub_top.getResulttype());
+		assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage());
+		assertEquals(pub_merged.getPublisher(), pub_top.getPublisher());
+		assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate());
+		assertEquals(pub_merged.getResourcetype().getClassid(), "0004");
+		assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation());
+		assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance());
+		assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection());
+		assertEquals(pub_merged.getInstance().size(),3);
+		assertEquals(pub_merged.getCountry().size(), 2);
+		assertEquals(pub_merged.getSubject().size(), 0);
+		assertEquals(pub_merged.getTitle().size(), 2);
+		assertEquals(pub_merged.getRelevantdate().size(),0);
+		assertEquals(pub_merged.getDescription().size(),0);
+		assertEquals(pub_merged.getSource().size(),0);
+		assertEquals(pub_merged.getFulltext().size(),0);
+		assertEquals(pub_merged.getFormat().size(),0);
+		assertEquals(pub_merged.getContributor().size(),0);
+		assertEquals(pub_merged.getCoverage().size(),0);
+		assertEquals(pub_merged.getContext().size(),0);
+		assertEquals(pub_merged.getExternalReference().size(),0);
+		assertEquals(pub_merged.getOriginalId().size(),3);
+		assertEquals(pub_merged.getCollectedfrom().size(),3);
+		assertEquals(pub_merged.getPid().size(),1);
+		assertEquals(pub_merged.getExtraInfo().size(),0);
+
+		//verify datainfo
+		assertEquals(pub_merged.getDataInfo(), dataInfo);
+
+		//verify datepicker
+		assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
+
+		//verify authors
+		assertEquals(pub_merged.getAuthor().size(), 9);
+		assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
+	}
+
+	public DataInfo setDI(){
+		DataInfo dataInfo = new DataInfo();
+		dataInfo.setTrust("0.9");
+		dataInfo.setDeletedbyinference(false);
+		dataInfo.setInferenceprovenance("testing");
+		dataInfo.setInferred(true);
+		return dataInfo;
+	}
+
+	public Publication getTopPub(List<Tuple2<String, Publication>> publications){
+
+		Double maxTrust = 0.0;
+		Publication maxPub = new Publication();
+		for (Tuple2<String, Publication> publication : publications) {
+			Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
+			if(pubTrust > maxTrust){
+				maxTrust = pubTrust;
+				maxPub = publication._2();
+			}
+		}
+		return maxPub;
+	}
+
+	public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
+		List<Tuple2<String, T>> res = new ArrayList<>();
+		BufferedReader reader;
+		try {
+			reader = new BufferedReader(new FileReader(path));
+			String line = reader.readLine();
+			while (line != null) {
+				res.add(
+						new Tuple2<>(
+								MapDocumentUtil.getJPathString("$.id", line),
+								new ObjectMapper().readValue(line, clazz))
+				);
+				// read next line
+				line = reader.readLine();
+			}
+			reader.close();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+
+		return res;
+	}
+
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java
@ -1,54 +0,0 @@
-
-package eu.dnetlib.dhp.oa.dedup;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
-import java.util.stream.Collectors;
-
-import org.apache.commons.io.IOUtils;
-import org.codehaus.jackson.map.ObjectMapper;
-import org.junit.jupiter.api.BeforeEach;
-
-import eu.dnetlib.dhp.schema.oaf.Publication;
-
-public class MergeAuthorTest {
-
-	private List<Publication> publicationsToMerge;
-	private final ObjectMapper mapper = new ObjectMapper();
-
-	@BeforeEach
-	public void setUp() throws Exception {
-		final String json = IOUtils
-			.toString(
-				this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json"));
-
-		publicationsToMerge = Arrays
-			.asList(json.split("\n"))
-			.stream()
-			.map(
-				s -> {
-					try {
-						return mapper.readValue(s, Publication.class);
-					} catch (IOException e) {
-						throw new RuntimeException(e);
-					}
-				})
-			.collect(Collectors.toList());
-	}
-
-	// FIX ME Michele DB this tests doesn't work
-	// @Test
-	public void test() throws Exception {
-		Publication dedup = new Publication();
-
-		publicationsToMerge
-			.forEach(
-				p -> {
-					dedup.mergeFrom(p);
-					dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor()));
-				});
-
-		System.out.println(mapper.writeValueAsString(dedup));
-	}
-}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json