Bug fix in AuthorMerger: the author list with the larger size now takes priority as the merge base (the PID count is only used to break ties); fixed the normalization of author names

2020-12-21 17:51:42 +01:00 · 2020-12-21 17:51:42 +01:00 · 794e22b09c
parent 6cb0dc3f43
commit 794e22b09c
3 changed files with 138 additions and 10 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.lang3.StringUtils;
@ -32,6 +33,24 @@ public class AuthorMerger {
 	}
 	/**
 	 * Merges two author lists, using the supplied similarity threshold for the PID enrichment.
 	 * <p>
 	 * The list with the larger size (as reported by {@code authorsSize}) is chosen as the base. When both
 	 * sizes are equal, the list with strictly more PIDs (as reported by {@code countAuthorsPids}) is the
 	 * base; on a full tie {@code b} is the base. The other list is passed to {@code enrichPidFromList}
 	 * as the enrichment source.
 	 *
 	 * @param a         first author list
 	 * @param b         second author list
 	 * @param threshold similarity threshold forwarded to {@code enrichPidFromList}
 	 * @return the list selected as base (the same instance as {@code a} or {@code b})
 	 */
 	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
 		final int pidCountA = countAuthorsPids(a);
 		final int pidCountB = countAuthorsPids(b);
 		final int sizeA = authorsSize(a);
 		final int sizeB = authorsSize(b);

 		// size decides first; the PID count is only the tie-breaker
 		final boolean firstIsBase;
 		if (sizeA != sizeB) {
 			firstIsBase = sizeA > sizeB;
 		} else {
 			firstIsBase = pidCountA > pidCountB;
 		}

 		final List<Author> target;
 		final List<Author> source;
 		if (firstIsBase) {
 			target = a;
 			source = b;
 		} else {
 			target = b;
 			source = a;
 		}

 		enrichPidFromList(target, source, threshold);
 		return target;
 	}
 	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
 		int pa = countAuthorsPids(a);
 		int pb = countAuthorsPids(b);
@ -39,20 +58,22 @@ public class AuthorMerger {
 		int sa = authorsSize(a);
 		int sb = authorsSize(b);
-		if (pa == pb) {
+		if (sa == sb) {
 			base = sa > sb ? a : b;
 			enrich = sa > sb ? b : a;
 		} else {
 			base = pa > pb ? a : b;
 			enrich = pa > pb ? b : a;
 		} else {
 			base = sa > sb ? a : b;
 			enrich = sa > sb ? b : a;
 		}
-		enrichPidFromList(base, enrich);
+		enrichPidFromList(base, enrich, THRESHOLD);
 		return base;
 	}
-	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+	private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
 		if (base == null || enrich == null)
 			return;
 		//<pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
 		final Map<String, Author> basePidAuthorMap = base
 			.stream()
 			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -63,6 +84,7 @@ public class AuthorMerger {
 					.map(p -> new Tuple2<>(pidToComparableString(p), a)))
 			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
 		//<pid, Author> (list of pid that are missing in the other list)
 		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
 			.stream()
 			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -74,6 +96,7 @@ public class AuthorMerger {
 					.map(p -> new Tuple2<>(p, a)))
 			.collect(Collectors.toList());
 		pidToEnrich
 			.forEach(
 				a -> {
@ -83,10 +106,10 @@ public class AuthorMerger {
 						.max(Comparator.comparing(Tuple2::_1));
 					if (simAuthor.isPresent()) {
-						double th = THRESHOLD;
+						double th = threshold;
 						// increase the threshold if the surname is too short
 						if (simAuthor.get()._2().getSurname() != null
-							&& simAuthor.get()._2().getSurname().length() <= 3)
+							&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
 							th = 0.99;
 						if (simAuthor.get()._1() > th) {
@ -156,7 +179,7 @@ public class AuthorMerger {
 	}
 	private static String normalize(final String s) {
-		return nfd(s)
+		String[] normalized = nfd(s)
 			.toLowerCase()
 			// do not compact the regexes in a single expression, would cause StackOverflowError
 			// in case
@ -166,7 +189,12 @@ public class AuthorMerger {
 			.replaceAll("(\\p{Punct})+", " ")
 			.replaceAll("(\\d)+", " ")
 			.replaceAll("(\\n)+", " ")
-			.trim();
+			.trim()
 			.split(" ");
 		Arrays.sort(normalized);
 		return String.join(" ", normalized);
 	}
 	private static String nfd(final String s) {
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java
@ -0,0 +1,97 @@
 package eu.dnetlib.dhp.oa.merge;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import scala.Tuple2;
 /**
  * Tests {@code AuthorMerger.merge} against the author lists of the publications stored in the
  * {@code publications_with_authors.json} test resource (one JSON publication per line).
  */
 public class AuthorMergerTest {

    /** Absolute path of the directory holding the test resources; set in {@link #setUp()}. */
    String publicationsBasePath;

    /** One author list per sample publication; set in {@link #setUp()}. */
    List<List<Author>> authors;

    /**
     * Resolves the resource directory and loads the author list of every sample publication.
     *
     * @throws Exception if the resource location cannot be converted to a file system path
     */
    @BeforeEach
    public void setUp() throws Exception {
        publicationsBasePath = Paths
                .get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
                .toFile()
                .getAbsolutePath();

        authors =
                readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
                .stream()
                .map(p -> p._2().getAuthor()).collect(Collectors.toList());
    }

    /**
     * Prints every input list and the merged result, then checks the size of the merged list.
     * Original author's note: used in the dedup, threshold set to 0.95.
     */
    @Test
    public void mergeTest() {

        // running counter instead of authors.indexOf(...): indexOf is a linear scan per list and
        // reports the first equal list, so two equal lists would be printed with the same number
        int listNumber = 0;
        for (List<Author> authorList : authors) {
            listNumber++;
            System.out.println("List " + listNumber);
            for (Author author : authorList) {
                System.out.println(authorToString(author));
            }
        }

        List<Author> merge = AuthorMerger.merge(authors);

        System.out.println("Merge ");
        for (Author author : merge) {
            System.out.println(authorToString(author));
        }

        Assertions.assertEquals(7, merge.size());
    }

    /**
     * Reads a file containing one JSON document per line.
     *
     * @param path  path of the file to read; decoded as UTF-8
     * @param clazz type each line is deserialized into
     * @param <T>   type of the deserialized documents
     * @return one tuple per line: the value found at JSON path {@code $.id} and the deserialized document
     * @throws IllegalStateException if the file cannot be read or a line cannot be parsed, so that a
     *                               broken fixture fails the test instead of yielding partial data
     */
    public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
        final List<Tuple2<String, T>> res = new ArrayList<>();
        // a single mapper is enough for the whole file
        final ObjectMapper mapper = new ObjectMapper();

        // try-with-resources closes the reader even when reading or parsing fails;
        // Files.newBufferedReader decodes as UTF-8 regardless of the platform default charset
        try (BufferedReader reader = Files.newBufferedReader(Paths.get(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                res
                        .add(
                                new Tuple2<>(
                                        MapDocumentUtil.getJPathString("$.id", line),
                                        mapper.readValue(line, clazz)));
            }
        } catch (IOException e) {
            throw new IllegalStateException("Unable to read sample file " + path, e);
        }

        return res;
    }

    /**
     * Builds a printable description of an author: the full name followed by the comparable string
     * of each of its pids.
     *
     * @param a author to describe
     * @return text of the form {@code Fullname = <name> pid = [<pid> <pid> ]}
     */
    public String authorToString(Author a){
        final StringBuilder print = new StringBuilder("Fullname = ");
        print.append(a.getFullname()).append(" pid = [");
        if (a.getPid() != null) {
            for (StructuredProperty sp : a.getPid()) {
                print.append(sp.toComparableString()).append(" ");
            }
        }
        print.append("]");
        return print.toString();
    }
 }
--- a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json