moved AuthorMerger into dhp-common

2020-10-08 10:33:55 +02:00 · 2020-10-08 10:33:55 +02:00 · eec418cd26
parent fe0a7870e6
commit eec418cd26
7 changed files with 183 additions and 179 deletions
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -92,6 +92,17 @@
 			<groupId>com.squareup.okhttp3</groupId>
 			<artifactId>okhttp</artifactId>
 		</dependency>
+
+		<dependency>
+			<groupId>eu.dnetlib</groupId>
+			<artifactId>dnet-pace-core</artifactId>
+		</dependency>
+
+		<dependency>
+			<groupId>eu.dnetlib.dhp</groupId>
+			<artifactId>dhp-schemas</artifactId>
+			<version>${project.version}</version>
+		</dependency>
 	</dependencies>

 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -0,0 +1,168 @@
+package eu.dnetlib.dhp.oa.merge;
+import java.text.Normalizer;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.wcohen.ss.JaroWinkler;
+
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.model.Person;
+import scala.Tuple2;
+/**
+ * Static helpers for merging the author lists of records that describe the same entity.
+ * One list is kept as the result (the "base") and the persistent identifiers (pids) found only
+ * in the other list are copied onto the base authors whose names are similar enough.
+ * <p>
+ * NOTE(review): nothing here works on copies. {@code merge} sorts the list it receives and
+ * {@code enrichPidFromList} adds pids to the {@code Author} objects of the base list.
+ */
+public class AuthorMerger {
+    /**
+     * Default similarity score a candidate must exceed (strictly) before a pid is copied onto it.
+     * {@code enrichPidFromList} raises it to 0.99 when the candidate's surname has 3 characters or less.
+     */
+    private static final Double THRESHOLD = 0.95;
+    /**
+     * Merges several author lists into a single one by folding them pairwise through
+     * {@link #mergeAuthor(List, List)}, starting from an empty list.
+     * <p>
+     * The lists are processed in descending order of {@link #countAuthorsPids(List)}.
+     *
+     * @param authors the author lists to merge; sorted in place by this call
+     * @return the merged list: a new empty list when {@code authors} is empty, otherwise one of the
+     *         elements of {@code authors} (not a copy), whose authors may have received extra pids
+     */
+    public static List<Author> merge(List<List<Author>> authors) {
+        // NOTE(review): reorders the caller's list; the negated comparison yields descending order
+        authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
+        // accumulator: replaced by the outcome of every pairwise merge
+        List<Author> author = new ArrayList<>();
+
+        for (List<Author> a : authors) {
+            author = mergeAuthor(author, a);
+        }
+
+        return author;
+
+    }
+    /**
+     * Merges two author lists: chooses one as the base, enriches it with the pids of the other
+     * and returns it.
+     * <p>
+     * The base is the list with more pid-carrying authors (see {@link #countAuthorsPids(List)});
+     * when that count is equal, the list with more entries; when the sizes are equal too, {@code b}.
+     * A {@code null} list counts as having 0 entries and 0 pid-carrying authors.
+     *
+     * @param a first author list, may be {@code null}
+     * @param b second author list, may be {@code null}
+     * @return the list chosen as base, i.e. {@code a} or {@code b} itself (not a copy)
+     */
+    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+        int pa = countAuthorsPids(a);
+        int pb = countAuthorsPids(b);
+        List<Author> base, enrich;
+        int sa = authorsSize(a);
+        int sb = authorsSize(b);
+        // same number of pid-carrying authors: prefer the longer list (b when the sizes are equal too)
+        if (pa == pb) {
+            base = sa > sb ? a : b;
+            enrich = sa > sb ? b : a;
+        } else {
+            base = pa > pb ? a : b;
+            enrich = pa > pb ? b : a;
+        }
+        enrichPidFromList(base, enrich);
+        return base;
+    }
+    /**
+     * Copies onto the authors of {@code base} the pids that appear only in {@code enrich}.
+     * <p>
+     * For every pid of {@code enrich} whose comparable form (see
+     * {@link #pidToComparableString(StructuredProperty)}) is not present in {@code base}, the base
+     * author with the highest {@code sim} score against the pid's owner is selected, and the pid is
+     * added to that author only when the score is strictly greater than the threshold.
+     * Does nothing when either list is {@code null}.
+     * <p>
+     * NOTE(review): the pids of {@code base} are indexed once, before any pid is added, so it looks
+     * like a pid carried by two enrich authors can be added twice. A null entry inside a pid list
+     * would fail in {@code pidToComparableString} - confirm callers never produce one.
+     *
+     * @param base   list whose authors receive the new pids (its {@code Author} objects are modified)
+     * @param enrich list providing the candidate pids
+     */
+    private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+        if (base == null || enrich == null)
+            return;
+        final Map<String, Author> basePidAuthorMap = base
+                .stream()
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .flatMap(
+                        a -> a
+                                .getPid()
+                                .stream()
+                                .map(p -> new Tuple2<>(pidToComparableString(p), a)))
+                .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); // duplicate key: keep the first author
+        // (pid, owner) pairs from enrich whose comparable key is not in the base index
+        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+                .stream()
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .flatMap(
+                        a -> a
+                                .getPid()
+                                .stream()
+                                .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
+                                .map(p -> new Tuple2<>(p, a)))
+                .collect(Collectors.toList());
+        // for each missing pid, pick the base author most similar to the pid's owner
+        pidToEnrich
+                .forEach(
+                        a -> {
+                            Optional<Tuple2<Double, Author>> simAuthor = base
+                                    .stream()
+                                    .map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
+                                    .max(Comparator.comparing(Tuple2::_1));
+                            // no value is present when base has no authors at all
+                            if (simAuthor.isPresent()) {
+                                double th = THRESHOLD;
+                                // raise the threshold when the best candidate's surname is very short (3 characters or less)
+                                if (simAuthor.get()._2().getSurname() != null
+                                        && simAuthor.get()._2().getSurname().length() <= 3)
+                                    th = 0.99;
+                                // strictly greater: a score equal to the threshold is rejected
+                                if (simAuthor.get()._1() > th) {
+                                    Author r = simAuthor.get()._2();
+                                    if (r.getPid() == null) {
+                                        r.setPid(new ArrayList<>());
+                                    }
+                                    r.getPid().add(a._1());
+                                }
+                            }
+                        });
+    }
+    /**
+     * Builds the key used to compare pids: the lower-cased qualifier classid immediately followed
+     * by the lower-cased value, with no separator in between. A {@code null} qualifier, classid or
+     * value contributes an empty string.
+     *
+     * @param pid the pid to convert; it is dereferenced without a null check
+     * @return the comparison key, never {@code null}
+     */
+    public static String pidToComparableString(StructuredProperty pid) {
+        return (pid.getQualifier() != null
+                ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
+                : "")
+                + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
+    }
+    /**
+     * Counts the authors that carry at least one pid with a non-blank value (see {@code hasPid}).
+     * Despite the name, the result is a number of authors, not a number of pids.
+     *
+     * @param authors list to inspect, may be {@code null}
+     * @return the number of such authors, 0 when {@code authors} is {@code null}
+     */
+    public static int countAuthorsPids(List<Author> authors) {
+        if (authors == null)
+            return 0;
+
+        return (int) authors.stream().filter(AuthorMerger::hasPid).count();
+    }
+    /** Null-safe size: 0 for a {@code null} list, {@code authors.size()} otherwise. */
+    private static int authorsSize(List<Author> authors) {
+        if (authors == null)
+            return 0;
+        return authors.size();
+    }
+    /**
+     * Scores how similar the names of two authors are, using {@code JaroWinkler}.
+     * <p>
+     * When both parsed persons report {@code isAccurate()}, the result is the mean of the surname
+     * score and the name score; otherwise it is the score of the two normalised full names.
+     * Every string goes through {@code normalize} before being scored.
+     *
+     * @param a first author
+     * @param b second author
+     * @return the similarity score; presumably in [0, 1], higher meaning more similar - TODO confirm
+     */
+    private static Double sim(Author a, Author b) {
+
+        final Person pa = parse(a);
+        final Person pb = parse(b);
+
+        // if both are accurate (e.g. they have name and surname); note: '&' evaluates both operands
+        if (pa.isAccurate() & pb.isAccurate()) {
+            return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
+                    + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
+        } else {
+            return new JaroWinkler()
+                    .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
+        }
+    }
+    /**
+     * Tells whether the author carries at least one non-null pid whose value is not blank.
+     *
+     * @param a author to test, may be {@code null}
+     * @return {@code false} for a {@code null} author or a {@code null}/empty pid list
+     */
+    private static boolean hasPid(Author a) {
+        if (a == null || a.getPid() == null || a.getPid().size() == 0)
+            return false;
+        return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
+    }
+    /**
+     * Converts an author into a {@code Person}: built from "surname, name" when the surname is not
+     * blank, from the full name otherwise.
+     * <p>
+     * NOTE(review): a null name is concatenated as the text "null", and a null full name is handed
+     * to the {@code Person} constructor unchanged - confirm both cases are acceptable.
+     */
+    private static Person parse(Author author) {
+        if (StringUtils.isNotBlank(author.getSurname())) {
+            return new Person(author.getSurname() + ", " + author.getName(), false);
+        } else {
+            return new Person(author.getFullname(), false);
+        }
+    }
+    /**
+     * Prepares a string for comparison: NFD decomposition and lower-casing, then runs of non-word
+     * characters, combining diacritical marks, punctuation, digits and newlines are each replaced
+     * by one space (in that order, one pass per category), and the result is trimmed.
+     * <p>
+     * NOTE(review): unless Unicode character classes are enabled, a non-word character is anything
+     * outside [a-zA-Z_0-9], so the first pass probably already covers marks, punctuation and
+     * newlines - verify before simplifying.
+     *
+     * @param s the string to normalize; presumably must not be {@code null} - TODO confirm
+     * @return the normalized string
+     */
+    private static String normalize(final String s) {
+        return nfd(s)
+                .toLowerCase()
+                // do not compact these regexes into a single expression:
+                // that would cause a StackOverflowError
+                // in case of large input strings
+                .replaceAll("(\\W)+", " ")
+                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+                .replaceAll("(\\p{Punct})+", " ")
+                .replaceAll("(\\d)+", " ")
+                .replaceAll("(\\n)+", " ")
+                .trim();
+    }
+    /** Returns {@code s} in Unicode canonical decomposition form (NFD). */
+    private static String nfd(final String s) {
+        return Normalizer.normalize(s, Normalizer.Form.NFD);
+    }
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java
@ -1,170 +0,0 @@
-
-package eu.dnetlib.dhp.oa.dedup;
-
-import java.text.Normalizer;
-import java.util.*;
-import java.util.stream.Collectors;
-
-import org.apache.commons.lang3.StringUtils;
-
-import com.wcohen.ss.JaroWinkler;
-
-import eu.dnetlib.dhp.schema.oaf.Author;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import eu.dnetlib.pace.model.Person;
-import scala.Tuple2;
-
-public class AuthorMerger {
-
-	private static final Double THRESHOLD = 0.95;
-
-	public static List<Author> merge(List<List<Author>> authors) {
-
-		authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
-
-		List<Author> author = new ArrayList<>();
-
-		for (List<Author> a : authors) {
-			author = mergeAuthor(author, a);
-		}
-
-		return author;
-
-	}
-
-	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
-		int pa = countAuthorsPids(a);
-		int pb = countAuthorsPids(b);
-		List<Author> base, enrich;
-		int sa = authorsSize(a);
-		int sb = authorsSize(b);
-
-		if (pa == pb) {
-			base = sa > sb ? a : b;
-			enrich = sa > sb ? b : a;
-		} else {
-			base = pa > pb ? a : b;
-			enrich = pa > pb ? b : a;
-		}
-		enrichPidFromList(base, enrich);
-		return base;
-	}
-
-	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
-		if (base == null || enrich == null)
-			return;
-		final Map<String, Author> basePidAuthorMap = base
-			.stream()
-			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
-			.flatMap(
-				a -> a
-					.getPid()
-					.stream()
-					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
-			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
-
-		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
-			.stream()
-			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
-			.flatMap(
-				a -> a
-					.getPid()
-					.stream()
-					.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
-					.map(p -> new Tuple2<>(p, a)))
-			.collect(Collectors.toList());
-
-		pidToEnrich
-			.forEach(
-				a -> {
-					Optional<Tuple2<Double, Author>> simAuthor = base
-						.stream()
-						.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
-						.max(Comparator.comparing(Tuple2::_1));
-
-					if (simAuthor.isPresent()) {
-						double th = THRESHOLD;
-						// increase the threshold if the surname is too short
-						if (simAuthor.get()._2().getSurname() != null
-							&& simAuthor.get()._2().getSurname().length() <= 3)
-							th = 0.99;
-
-						if (simAuthor.get()._1() > th) {
-							Author r = simAuthor.get()._2();
-							if (r.getPid() == null) {
-								r.setPid(new ArrayList<>());
-							}
-							r.getPid().add(a._1());
-						}
-					}
-				});
-	}
-
-	public static String pidToComparableString(StructuredProperty pid) {
-		return (pid.getQualifier() != null
-			? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
-			: "")
-			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
-	}
-
-	public static int countAuthorsPids(List<Author> authors) {
-		if (authors == null)
-			return 0;
-
-		return (int) authors.stream().filter(AuthorMerger::hasPid).count();
-	}
-
-	private static int authorsSize(List<Author> authors) {
-		if (authors == null)
-			return 0;
-		return authors.size();
-	}
-
-	private static Double sim(Author a, Author b) {
-
-		final Person pa = parse(a);
-		final Person pb = parse(b);
-
-		// if both are accurate (e.g. they have name and surname)
-		if (pa.isAccurate() & pb.isAccurate()) {
-			return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
-				+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
-		} else {
-			return new JaroWinkler()
-				.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
-		}
-	}
-
-	private static boolean hasPid(Author a) {
-		if (a == null || a.getPid() == null || a.getPid().size() == 0)
-			return false;
-		return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
-	}
-
-	private static Person parse(Author author) {
-		if (StringUtils.isNotBlank(author.getSurname())) {
-			return new Person(author.getSurname() + ", " + author.getName(), false);
-		} else {
-			return new Person(author.getFullname(), false);
-		}
-	}
-
-	private static String normalize(final String s) {
-		return nfd(s)
-			.toLowerCase()
-			// do not compact the regexes in a single expression, would cause StackOverflowError
-			// in case
-			// of large input strings
-			.replaceAll("(\\W)+", " ")
-			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
-			.replaceAll("(\\p{Punct})+", " ")
-			.replaceAll("(\\d)+", " ")
-			.replaceAll("(\\n)+", " ")
-			.trim();
-	}
-
-	private static String nfd(final String s) {
-		return Normalizer.normalize(s, Normalizer.Form.NFD);
-	}
-
-}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -1,11 +1,12 @@

 package eu.dnetlib.dhp.oa.dedup;

-import java.io.Serializable;
+
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;

+import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -10,6 +10,7 @@ import java.io.Serializable;
 import java.nio.file.Paths;
 import java.util.*;

+import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@ -83,13 +83,6 @@
            <artifactId>dhp-schemas</artifactId>
            <version>${project.version}</version>
        </dependency>
-
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-dedup-openaire</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
@ -1,5 +1,5 @@
 package eu.dnetlib.dhp.sx.ebi
-import eu.dnetlib.dhp.oa.dedup.AuthorMerger
+import eu.dnetlib.dhp.oa.merge.AuthorMerger
 import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
 import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
 import org.apache.spark.sql.{Encoder, Encoders}