Bug fix in AuthorMerger: the author list with the larger size now takes priority as the merge base (the pid count is only used to break ties), and the normalization of author names is fixed

2020-12-21 17:51:42 +01:00 · 2020-12-21 17:51:42 +01:00 · 794e22b09c
parent 6cb0dc3f43
commit 794e22b09c
3 changed files with 138 additions and 10 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;

 import org.apache.commons.lang3.StringUtils;

@ -32,6 +33,24 @@ public class AuthorMerger {

 	}

+	/**
+	 * Merges two author lists using a caller-supplied similarity threshold.
+	 * <p>
+	 * The list with the larger {@code authorsSize} is chosen as the base; when both sizes are equal
+	 * the list with the higher {@code countAuthorsPids} is chosen, and {@code b} wins a full tie.
+	 * The base is handed to {@code enrichPidFromList} together with the other list and then returned.
+	 *
+	 * @param a first author list
+	 * @param b second author list
+	 * @param threshold similarity threshold forwarded to {@code enrichPidFromList}; {@code null}
+	 *        falls back to the class default {@code THRESHOLD} instead of failing later on unboxing
+	 * @return the list selected as base (either {@code a} or {@code b}, not a copy)
+	 */
+	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
+		final int pa = countAuthorsPids(a);
+		final int pb = countAuthorsPids(b);
+		final int sa = authorsSize(a);
+		final int sb = authorsSize(b);
+
+		// size decides first; the pid count only breaks ties between equally sized lists
+		final boolean aIsBase = sa == sb ? pa > pb : sa > sb;
+		final List<Author> base = aIsBase ? a : b;
+		final List<Author> enrich = aIsBase ? b : a;
+
+		// enrichPidFromList unboxes the threshold, so a null value must be replaced up front
+		enrichPidFromList(base, enrich, threshold != null ? threshold : THRESHOLD);
+		return base;
+	}
+
 	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
 		int pa = countAuthorsPids(a);
 		int pb = countAuthorsPids(b);
@ -39,20 +58,22 @@ public class AuthorMerger {
 		int sa = authorsSize(a);
 		int sb = authorsSize(b);

-		if (pa == pb) {
-			base = sa > sb ? a : b;
-			enrich = sa > sb ? b : a;
-		} else {
+		if (sa == sb) {
 			base = pa > pb ? a : b;
 			enrich = pa > pb ? b : a;
+		} else {
+			base = sa > sb ? a : b;
+			enrich = sa > sb ? b : a;
 		}
-		enrichPidFromList(base, enrich);
+		enrichPidFromList(base, enrich, THRESHOLD);
 		return base;
 	}

-	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
 		if (base == null || enrich == null)
 			return;
+
+		// <pidComparableString, Author> map (an Author with more than one pid appears once per pid)
 		final Map<String, Author> basePidAuthorMap = base
 			.stream()
 			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -63,6 +84,7 @@ public class AuthorMerger {
 					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
 			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));

+		// <pid, Author> pairs (pids from the enrich list that are missing in the base list)
 		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
 			.stream()
 			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -74,6 +96,7 @@ public class AuthorMerger {
 					.map(p -> new Tuple2<>(p, a)))
 			.collect(Collectors.toList());

+
 		pidToEnrich
 			.forEach(
 				a -> {
@ -83,10 +106,10 @@ public class AuthorMerger {
 						.max(Comparator.comparing(Tuple2::_1));

 					if (simAuthor.isPresent()) {
-						double th = THRESHOLD;
+						double th = threshold;
 						// increase the threshold if the surname is too short
 						if (simAuthor.get()._2().getSurname() != null
-							&& simAuthor.get()._2().getSurname().length() <= 3)
+							&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
 							th = 0.99;

 						if (simAuthor.get()._1() > th) {
@ -156,7 +179,7 @@ public class AuthorMerger {
 	}

 	private static String normalize(final String s) {
-		return nfd(s)
+		String[] normalized = nfd(s)
 			.toLowerCase()
 			// do not compact the regexes in a single expression, would cause StackOverflowError
 			// in case
@ -166,7 +189,12 @@ public class AuthorMerger {
 			.replaceAll("(\\p{Punct})+", " ")
 			.replaceAll("(\\d)+", " ")
 			.replaceAll("(\\n)+", " ")
-			.trim();
+			.trim()
+			.split(" ");
+
+		Arrays.sort(normalized);
+
+		return String.join(" ", normalized);
 	}

 	private static String nfd(final String s) {
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java
@ -0,0 +1,97 @@
+package eu.dnetlib.dhp.oa.merge;
+
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import scala.Tuple2;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Exercises {@code AuthorMerger.merge} on the author lists of the sample publications stored in
+ * the {@code publications_with_authors.json} test resource (one JSON document per line).
+ */
+public class AuthorMergerTest {
+
+    String publicationsBasePath;
+
+    List<List<Author>> authors;
+
+    /**
+     * Resolves the resource folder and loads the author list of every sample publication.
+     *
+     * @throws Exception if the resource folder cannot be resolved or the sample cannot be read
+     */
+    @BeforeEach
+    public void setUp() throws Exception {
+
+        publicationsBasePath = Paths
+                .get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
+                .toFile()
+                .getAbsolutePath();
+
+        authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
+                .stream()
+                .map(p -> p._2().getAuthor())
+                .collect(Collectors.toList());
+
+    }
+
+    /**
+     * Prints every input author list and the merged result, then checks the merged size.
+     */
+    @Test
+    public void mergeTest() { // used in the dedup: threshold set to 0.95
+
+        // iterate by index: indexOf() is quadratic and reports the wrong number for equal lists
+        for (int i = 0; i < authors.size(); i++) {
+            System.out.println("List " + (i + 1));
+            for (Author author : authors.get(i)) {
+                System.out.println(authorToString(author));
+            }
+        }
+
+        List<Author> merge = AuthorMerger.merge(authors);
+
+        System.out.println("Merge ");
+        for (Author author : merge) {
+            System.out.println(authorToString(author));
+        }
+
+        Assertions.assertEquals(7, merge.size());
+
+    }
+
+    /**
+     * Reads a file containing one JSON document per line and deserializes each line.
+     *
+     * @param path  path of the file to read (decoded as UTF-8)
+     * @param clazz target type of each JSON document
+     * @param <T>   type of the deserialized documents
+     * @return one tuple per non-blank line: the value extracted with the {@code $.id} JSON path
+     *         and the deserialized document
+     * @throws UncheckedIOException if the file cannot be read or a line cannot be parsed, so that a
+     *                              broken fixture fails the test instead of yielding partial data
+     */
+    public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
+        final List<Tuple2<String, T>> res = new ArrayList<>();
+        // one mapper for the whole file instead of a new instance per line
+        final ObjectMapper mapper = new ObjectMapper();
+
+        // try-with-resources closes the reader even when parsing a line fails
+        try (BufferedReader reader = Files.newBufferedReader(Paths.get(path), StandardCharsets.UTF_8)) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                if (line.trim().isEmpty()) {
+                    // tolerate blank lines (e.g. trailing newlines) in the sample file
+                    continue;
+                }
+                res
+                        .add(
+                                new Tuple2<>(
+                                        MapDocumentUtil.getJPathString("$.id", line),
+                                        mapper.readValue(line, clazz)));
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException("unable to read sample file " + path, e);
+        }
+
+        return res;
+    }
+
+    /**
+     * Renders an author as {@code Fullname = <fullname> pid = [<pid> <pid> ]} for console output.
+     *
+     * @param a the author to render
+     * @return the printable representation; the pid section is empty when the author has no pid list
+     */
+    public String authorToString(Author a) {
+
+        final StringBuilder print = new StringBuilder("Fullname = ")
+                .append(a.getFullname())
+                .append(" pid = [");
+        if (a.getPid() != null) {
+            for (StructuredProperty sp : a.getPid()) {
+                print.append(sp.toComparableString()).append(' ');
+            }
+        }
+        return print.append(']').toString();
+    }
+}
--- a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json