Merge pull request 'Enhance Dedup authors matching with algorithms used for ORCID enhancements (task 9690)' (#419) from dedup_authorsmatch_bytoken into beta

Reviewed-on: D-Net/dnet-hadoop#419
2024-04-16 10:24:11 +02:00 · 2024-04-16 10:24:11 +02:00 · da333e9f4d
parent 43fd1de681 43b454399f
commit da333e9f4d
6 changed files with 169 additions and 94 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@ -497,9 +497,14 @@ public class MergeUtils {
 	}
 	private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
 		if (d1 == null || StringUtils.isBlank(d1.getValue())) {
 			return d2;
 		} else if (d2 == null || StringUtils.isBlank(d2.getValue())) {
 			return d1;
 		}
 		return Stream
 			.of(d1, d2)
 			.filter(Objects::nonNull)
 			.min(
 				Comparator
 					.comparing(
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@ -1,16 +1,18 @@
 package eu.dnetlib.pace.tree;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
 import com.wcohen.ss.AbstractStringDistance;
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.model.Person;
 import eu.dnetlib.pace.tree.support.AbstractListComparator;
 import eu.dnetlib.pace.tree.support.ComparatorClass;
 import eu.dnetlib.pace.util.AuthorMatchers;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.function.BiFunction;
 import java.util.stream.Collectors;
@ComparatorClass("authorsMatch")
 public class AuthorsMatch extends AbstractListComparator {
@ -41,24 +43,36 @@ public class AuthorsMatch extends AbstractListComparator {
 	}
 	@Override
-	public double compare(final List<String> a, final List<String> b, final Config conf) {
+	public double compare(final List<String> left, final List<String> right, final Config conf) {
-		if (a.isEmpty() || b.isEmpty())
+		if (left.isEmpty() || right.isEmpty())
 			return -1;
-		if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
+		if (left.size() > SIZE_THRESHOLD || right.size() > SIZE_THRESHOLD)
 			return 1.0;
 		int maxMiss = Integer.MAX_VALUE;
 		List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
 		Double threshold = getDoubleParam("threshold");
 		int maxMiss = Integer.MAX_VALUE;
-		if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) {
+		if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && left.size() == right.size()) {
-			maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size()));
+			maxMiss = (int) Math.floor((1 - threshold) * Math.max(left.size(), right.size()));
 		}
 		int common = 0;
 		List<String> a = new ArrayList<>(left);
 		List<String> b = new ArrayList<>(right);
 		common += AuthorMatchers
 			.removeMatches(a, b, (BiFunction<String, String, Object>) AuthorMatchers::matchEqualsIgnoreCase)
 			.size() / 2;
 		common += AuthorMatchers
 			.removeMatches(a, b, (BiFunction<String, String, Object>) AuthorMatchers::matchOrderedTokenAndAbbreviations)
 			.size() / 2;
 		List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
 		// compare each element of List1 with each element of List2
 		int alreadyMatched = common;
 		for (int i = 0; i < a.size(); i++) {
 			Person p1 = new Person(a.get(i), false);
@ -123,13 +137,13 @@ public class AuthorsMatch extends AbstractListComparator {
 				}
 			}
-			if (i - common > maxMiss) {
+			if (i - common - alreadyMatched > maxMiss) {
 				return 0.0;
 			}
 		}
 		// normalization factor to compute the score
-		int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common);
+		int normFactor = left.size() == right.size() ? left.size() : (left.size() + right.size() - common);
 		if (TYPE.equals("percentage")) {
 			return (double) common / normFactor;
@ -160,5 +174,4 @@ public class AuthorsMatch extends AbstractListComparator {
 	/**
 	 * Runs the given string through {@code cleanup}, then {@code utf8}, then {@code normalize},
 	 * and returns the outcome of the last step.
 	 *
 	 * @param s the string to process
 	 * @return the value produced by {@code normalize(utf8(cleanup(s)))}
 	 */
 	public String normalization(String s) {
 		final String cleaned = cleanup(s);
 		final String encoded = utf8(cleaned);
 		return normalize(encoded);
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala
@ -1,9 +1,10 @@
-package eu.dnetlib.dhp.enrich.orcid
+package eu.dnetlib.pace.util
 import java.util.Locale
 import java.util.regex.Pattern
 import scala.util.control.Breaks.{break, breakable}
-object ORCIDAuthorMatchers {
+object AuthorMatchers {
  val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+")
  val WORD_DIFF = 2
@ -45,6 +46,7 @@ object ORCIDAuthorMatchers {
        var res: Boolean = false
        if (e1.length != 1 && e2.length != 1) {
          res = e1 == e2
          if (res)
            longMatches += 1
        } else {
          res = true
@ -62,4 +64,49 @@ object ORCIDAuthorMatchers {
    }
    longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length)
  }
  /**
   * Overload accepting a `java.util.function.BiFunction` as the matching predicate.
   * It adapts the predicate to a Scala function and forwards all arguments to the
   * function-typed `removeMatches` overload.
   *
   * @param graph_authors first list of names, forwarded unchanged
   * @param orcid_authors second list of names, forwarded unchanged
   * @param matchingFunc  predicate invoked with one name from each list
   * @return the list returned by the function-typed overload for the same arguments
   */
  def removeMatches(
                     graph_authors: java.util.List[String],
                     orcid_authors: java.util.List[String],
                     matchingFunc: java.util.function.BiFunction[String,String,Boolean]
                   ) : java.util.List[String] = {
    // Give the adapter an explicit function type so the delegation below
    // targets the (String, String) => Boolean overload.
    val predicate: (String, String) => Boolean =
      (left, right) => matchingFunc.apply(left, right)

    removeMatches(graph_authors, orcid_authors, predicate)
  }
  /**
   * Pairs up entries of the two lists using `matchingFunc` and removes every paired
   * entry from BOTH input lists (the lists are mutated in place through their iterators).
   *
   * Each element of `graph_authors` is compared, in list order, against the elements
   * still present in `orcid_authors`; the first one for which `matchingFunc` returns
   * true is taken as its match, so every entry takes part in at most one pair.
   *
   * @param graph_authors mutable list of names; matched entries are removed. May be null.
   * @param orcid_authors mutable list of names; matched entries are removed. May be null.
   * @param matchingFunc  predicate called as `matchingFunc(graphAuthor, orcidAuthor)`
   * @return a new list holding the matched pairs flattened as
   *         `[graphAuthor1, orcidAuthor1, graphAuthor2, orcidAuthor2, ...]`;
   *         empty when either input list is null or empty
   */
  def removeMatches(
                                       graph_authors: java.util.List[String],
                                       orcid_authors: java.util.List[String],
                                       matchingFunc: (String, String) => Boolean
                                     ) : java.util.List[String]  = {
    val matched = new java.util.ArrayList[String]()

    // Nothing can be paired when either side is missing or empty. The second list
    // must be guarded too, otherwise `orcid_authors.iterator` throws on null.
    val hasGraphAuthors = graph_authors != null && !graph_authors.isEmpty
    val hasOrcidAuthors = orcid_authors != null && !orcid_authors.isEmpty

    if (hasGraphAuthors && hasOrcidAuthors) {
      val ait = graph_authors.iterator

      while (ait.hasNext) {
        val author = ait.next()
        // A fresh iterator per author: candidates removed by earlier matches are gone.
        val oit = orcid_authors.iterator
        var found = false

        // Stop at the first match so each author is paired at most once.
        while (!found && oit.hasNext) {
          val orcid = oit.next()

          if (matchingFunc(author, orcid)) {
            // Iterator.remove keeps both traversals valid while shrinking the lists.
            ait.remove()
            oit.remove()
            matched.add(author)
            matched.add(orcid)
            found = true
          }
        }
      }
    }

    matched
  }
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
@ -43,15 +43,13 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import eu.dnetlib.dhp.schema.sx.OafUtils;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import scala.Tuple2;
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
 public class SparkDedupTest implements Serializable {
 	static final boolean CHECK_CARDINALITIES = true;
 	@Mock(serializable = true)
 	ISLookUpService isLookUpService;
@ -191,12 +189,13 @@ public class SparkDedupTest implements Serializable {
 		System.out.println("ds_simrel = " + ds_simrel);
 		System.out.println("orp_simrel = " + orp_simrel);
 		if (CHECK_CARDINALITIES) {
 			assertEquals(751, orgs_simrel);
-		assertEquals(546, pubs_simrel);
+			assertEquals(566, pubs_simrel);
 			assertEquals(113, sw_simrel);
 			assertEquals(148, ds_simrel);
 			assertEquals(280, orp_simrel);
-
+		}
 	}
 	@Test
@ -239,21 +238,27 @@ public class SparkDedupTest implements Serializable {
 			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct"))
 			.count();
 		// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
 		assertEquals(751, orgs_simrel);
 		assertEquals(546, pubs_simrel);
 		assertEquals(148, ds_simrel);
 		assertEquals(280, orp_simrel);
 //		System.out.println("orgs_simrel = " + orgs_simrel);
 //		System.out.println("pubs_simrel = " + pubs_simrel);
 //		System.out.println("ds_simrel = " + ds_simrel);
 //		System.out.println("orp_simrel = " + orp_simrel);
 		// entities simrels to be different from the number of previous step (new simrels in the whitelist)
 		Dataset<Row> sw_simrel = spark
 			.read()
 			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "software"));
 		System.out.println("orgs_simrel = " + orgs_simrel);
 		System.out.println("pubs_simrel = " + pubs_simrel);
 		System.out.println("ds_simrel = " + ds_simrel);
 		System.out.println("orp_simrel = " + orp_simrel);
 		System.out.println("sw_simrel = " + sw_simrel.count());
 		// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
 		if (CHECK_CARDINALITIES) {
 			assertEquals(751, orgs_simrel);
 			assertEquals(566, pubs_simrel);
 			assertEquals(148, ds_simrel);
 			assertEquals(280, orp_simrel);
 			assertEquals(115, sw_simrel.count());
 		}
 		// check if the first relation in the whitelist exists
 		assertTrue(
 			sw_simrel
@ -272,10 +277,6 @@ public class SparkDedupTest implements Serializable {
 					rel -> rel.getSource().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[0])
 						&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
 				.count() > 0);
 		assertEquals(115, sw_simrel.count());
 //		System.out.println("sw_simrel = " + sw_simrel.count());
 	}
 	@Test
@ -466,17 +467,19 @@ public class SparkDedupTest implements Serializable {
 			assertTrue(dups.contains(r.getSource()));
 		});
 		System.out.println("orgs_mergerel = " + orgs_mergerel);
 		System.out.println("pubs_mergerel = " + pubs.count());
 		System.out.println("sw_mergerel = " + sw_mergerel);
 		System.out.println("ds_mergerel = " + ds_mergerel);
 		System.out.println("orp_mergerel = " + orp_mergerel);
 		if (CHECK_CARDINALITIES) {
 			assertEquals(1268, orgs_mergerel);
-		assertEquals(1112, pubs.count());
+			assertEquals(1156, pubs.count());
 			assertEquals(292, sw_mergerel);
 			assertEquals(476, ds_mergerel);
 			assertEquals(742, orp_mergerel);
-//		System.out.println("orgs_mergerel = " + orgs_mergerel);
+		}
 //		System.out.println("pubs_mergerel = " + pubs_mergerel);
 //		System.out.println("sw_mergerel = " + sw_mergerel);
 //		System.out.println("ds_mergerel = " + ds_mergerel);
 //		System.out.println("orp_mergerel = " + orp_mergerel);
 	}
 	@Test
@ -552,17 +555,19 @@ public class SparkDedupTest implements Serializable {
 			assertTrue(dups.contains(r.getSource()));
 		});
 		System.out.println("orgs_mergerel = " + orgs_mergerel);
 		System.out.println("pubs_mergerel = " + pubs.count());
 		System.out.println("sw_mergerel = " + sw_mergerel);
 		System.out.println("ds_mergerel = " + ds_mergerel);
 		System.out.println("orp_mergerel = " + orp_mergerel);
 		if (CHECK_CARDINALITIES) {
 			assertEquals(1268, orgs_mergerel);
-		assertEquals(1112, pubs.count());
+			assertEquals(1156, pubs.count());
 			assertEquals(292, sw_mergerel);
 			assertEquals(476, ds_mergerel);
 			assertEquals(742, orp_mergerel);
-//		System.out.println("orgs_mergerel = " + orgs_mergerel);
+		}
 //		System.out.println("pubs_mergerel = " + pubs_mergerel);
 //		System.out.println("sw_mergerel = " + sw_mergerel);
 //		System.out.println("ds_mergerel = " + ds_mergerel);
 //		System.out.println("orp_mergerel = " + orp_mergerel);
 	}
 	@Test
@ -607,19 +612,21 @@ public class SparkDedupTest implements Serializable {
 				testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
 			.count();
 		System.out.println("orgs_deduprecord = " + orgs_deduprecord);
 		System.out.println("pubs_deduprecord = " + pubs.count());
 		System.out.println("sw_deduprecord = " + sw_deduprecord);
 		System.out.println("ds_deduprecord = " + ds_deduprecord);
 		System.out.println("orp_deduprecord = " + orp_deduprecord);
 		if (CHECK_CARDINALITIES) {
 			assertEquals(86, orgs_deduprecord);
-		assertEquals(91, pubs.count());
+			assertEquals(96, pubs.count());
 			assertEquals(47, sw_deduprecord);
 			assertEquals(97, ds_deduprecord);
 			assertEquals(92, orp_deduprecord);
 		}
 		verifyRoot_1(mapper, pubs);
 //		System.out.println("orgs_deduprecord = " + orgs_deduprecord);
 //		System.out.println("pubs_deduprecord = " + pubs_deduprecord);
 //		System.out.println("sw_deduprecord = " + sw_deduprecord);
 //		System.out.println("ds_deduprecord = " + ds_deduprecord);
 //		System.out.println("orp_deduprecord = " + orp_deduprecord);
 	}
 	private static void verifyRoot_1(ObjectMapper mapper, Dataset<Publication> pubs) {
@ -745,21 +752,23 @@ public class SparkDedupTest implements Serializable {
 			.distinct()
 			.count();
-		assertEquals(925, publications);
+		System.out.println("publications = " + publications);
 		System.out.println("organizations = " + organizations);
 		System.out.println("projects = " + projects);
 		System.out.println("datasource = " + datasource);
 		System.out.println("software = " + softwares);
 		System.out.println("dataset = " + dataset);
 		System.out.println("otherresearchproduct = " + otherresearchproduct);
 		if (CHECK_CARDINALITIES) {
 			assertEquals(930, publications);
 			assertEquals(839, organizations);
 			assertEquals(100, projects);
 			assertEquals(100, datasource);
 			assertEquals(196, softwares);
 			assertEquals(389, dataset);
 			assertEquals(520, otherresearchproduct);
-
+		}
 //		System.out.println("publications = " + publications);
 //		System.out.println("organizations = " + organizations);
 //		System.out.println("projects = " + projects);
 //		System.out.println("datasource = " + datasource);
 //		System.out.println("software = " + softwares);
 //		System.out.println("dataset = " + dataset);
 //		System.out.println("otherresearchproduct = " + otherresearchproduct);
 		long deletedOrgs = jsc
 			.textFile(testDedupGraphBasePath + "/organization")
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.enrich.orcid
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
 import eu.dnetlib.dhp.schema.sx.OafUtils
 import eu.dnetlib.pace.util.AuthorMatchers
 import java.util
 import scala.beans.BeanProperty
@ -39,7 +40,7 @@ object ORCIDAuthorEnricher extends Serializable {
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
-          ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName),
+          AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName),
        "fullName"
      ) ++
      // Look after exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName
@ -47,7 +48,7 @@ object ORCIDAuthorEnricher extends Serializable {
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
-          ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName),
+          AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName),
        "reversedFullName"
      ) ++
      // split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations
@ -55,7 +56,7 @@ object ORCIDAuthorEnricher extends Serializable {
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
-          ORCIDAuthorMatchers
+          AuthorMatchers
            .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
        "orderedTokens"
      ) ++
@ -63,7 +64,7 @@ object ORCIDAuthorEnricher extends Serializable {
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
-        (author, orcid) => ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName),
+        (author, orcid) => AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName),
        "creditName"
      ) ++
      // look after exact matches in  ORCID otherNames
@ -71,7 +72,7 @@ object ORCIDAuthorEnricher extends Serializable {
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
-          orcid.otherNames != null && ORCIDAuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala),
+          orcid.otherNames != null && AuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala),
        "otherNames"
      )
    }
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
@ -1,6 +1,6 @@
 package eu.dnetlib.dhp.enrich.orcid
-import eu.dnetlib.dhp.enrich.orcid.ORCIDAuthorMatchers.matchOrderedTokenAndAbbreviations
+import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations
 import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
 import org.junit.jupiter.api.Test