diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
index 0ff90e024..316891faf 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@@ -497,9 +497,14 @@ public class MergeUtils {
 	}
 
 	private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
+		if (d1 == null || StringUtils.isBlank(d1.getValue())) {
+			return d2;
+		} else if (d2 == null || StringUtils.isBlank(d2.getValue())) {
+			return d1;
+		}
+
 		return Stream
 			.of(d1, d2)
-			.filter(Objects::nonNull)
 			.min(
 				Comparator
 					.comparing(
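The guard added above closes a gap: .filter(Objects::nonNull) protected the min comparison against null fields, but a non-null Field wrapping a blank string could still win over a populated date. A minimal sketch of the resulting selection rule, with plain String standing in for Field<String> and assuming ISO-8601 date strings so that lexicographic order matches chronological order:

import org.apache.commons.lang3.StringUtils;

// Sketch of the selection rule above; String stands in for Field<String>.
public class SelectOldestDateSketch {

	static String selectOldestDate(String d1, String d2) {
		if (StringUtils.isBlank(d1)) {
			return d2; // null/blank loses to a populated date
		} else if (StringUtils.isBlank(d2)) {
			return d1;
		}
		// Assumes ISO-8601 values, where lexicographic order is chronological.
		return d1.compareTo(d2) <= 0 ? d1 : d2;
	}

	public static void main(String[] args) {
		System.out.println(selectOldestDate("", "2020-01-01"));           // 2020-01-01
		System.out.println(selectOldestDate("2019-05-02", null));         // 2019-05-02
		System.out.println(selectOldestDate("2019-05-02", "2020-01-01")); // 2019-05-02
	}
}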
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
index edad0ae2e..0921d7a64 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@@ -1,16 +1,18 @@
 
 package eu.dnetlib.pace.tree;
 
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-
 import com.wcohen.ss.AbstractStringDistance;
-
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.model.Person;
 import eu.dnetlib.pace.tree.support.AbstractListComparator;
 import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.util.AuthorMatchers;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.function.BiFunction;
+import java.util.stream.Collectors;
 
 @ComparatorClass("authorsMatch")
 public class AuthorsMatch extends AbstractListComparator {
@@ -41,24 +43,36 @@ public class AuthorsMatch extends AbstractListComparator {
 	}
 
 	@Override
-	public double compare(final List<String> a, final List<String> b, final Config conf) {
-		if (a.isEmpty() || b.isEmpty())
+	public double compare(final List<String> left, final List<String> right, final Config conf) {
+		if (left.isEmpty() || right.isEmpty())
 			return -1;
 
-		if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
+		if (left.size() > SIZE_THRESHOLD || right.size() > SIZE_THRESHOLD)
 			return 1.0;
 
-		int maxMiss = Integer.MAX_VALUE;
-		List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
-
 		Double threshold = getDoubleParam("threshold");
+		int maxMiss = Integer.MAX_VALUE;
 
-		if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) {
-			maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size()));
+		if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && left.size() == right.size()) {
+			maxMiss = (int) Math.floor((1 - threshold) * Math.max(left.size(), right.size()));
 		}
 
 		int common = 0;
+
+		List<String> a = new ArrayList<>(left);
+		List<String> b = new ArrayList<>(right);
+
+		common += AuthorMatchers
+			.removeMatches(a, b, (BiFunction<String, String, Boolean>) AuthorMatchers::matchEqualsIgnoreCase)
+			.size() / 2;
+		common += AuthorMatchers
+			.removeMatches(a, b, (BiFunction<String, String, Boolean>) AuthorMatchers::matchOrderedTokenAndAbbreviations)
+			.size() / 2;
+
+		List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
+
 		// compare each element of List1 with each element of List2
+		int alreadyMatched = common;
 		for (int i = 0; i < a.size(); i++) {
 			Person p1 = new Person(a.get(i), false);
@@ -123,13 +137,13 @@ public class AuthorsMatch extends AbstractListComparator {
 				}
 			}
 
-			if (i - common > maxMiss) {
+			if (i - common - alreadyMatched > maxMiss) {
 				return 0.0;
 			}
 		}
 
 		// normalization factor to compute the score
-		int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common);
+		int normFactor = left.size() == right.size() ? left.size() : (left.size() + right.size() - common);
 
 		if (TYPE.equals("percentage")) {
 			return (double) common / normFactor;
@@ -160,5 +174,4 @@ public class AuthorsMatch extends AbstractListComparator {
 	public String normalization(String s) {
 		return normalize(utf8(cleanup(s)));
 	}
-
 }
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala
similarity index 56%
rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala
rename to dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala
index 49574fe2d..116f515ed 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala
@@ -1,9 +1,10 @@
-package eu.dnetlib.dhp.enrich.orcid
+package eu.dnetlib.pace.util
 
 import java.util.Locale
 import java.util.regex.Pattern
+import scala.util.control.Breaks.{break, breakable}
 
-object ORCIDAuthorMatchers {
+object AuthorMatchers {
   val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+")
 
   val WORD_DIFF = 2
@@ -45,7 +46,8 @@
       var res: Boolean = false
       if (e1.length != 1 && e2.length != 1) {
         res = e1 == e2
-        longMatches += 1
+        if (res)
+          longMatches += 1
       } else {
         res = true
         shortMatches += 1
@@ -62,4 +64,49 @@
     }
     longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length)
   }
+
+  def removeMatches(
+    graph_authors: java.util.List[String],
+    orcid_authors: java.util.List[String],
+    matchingFunc: java.util.function.BiFunction[String, String, Boolean]
+  ): java.util.List[String] = {
+    removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a, b))
+  }
+
+
+  def removeMatches(
+    graph_authors: java.util.List[String],
+    orcid_authors: java.util.List[String],
+    matchingFunc: (String, String) => Boolean
+  ): java.util.List[String] = {
+    val matched = new java.util.ArrayList[String]()
+
+    if (graph_authors != null && !graph_authors.isEmpty) {
+      val ait = graph_authors.iterator
+
+      while (ait.hasNext) {
+        val author = ait.next()
+        val oit = orcid_authors.iterator
+
+        breakable {
+          while (oit.hasNext) {
+            val orcid = oit.next()
+
+            if (matchingFunc(author, orcid)) {
+              ait.remove()
+              oit.remove()
+
+              matched.add(author)
+              matched.add(orcid)
+
+              break()
+            }
+          }
+        }
+      }
+    }
+
+    matched
+  }
+
 }
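AuthorsMatch now pre-matches authors through the relocated AuthorMatchers before the pairwise Person comparison: exact case-insensitive matches and ordered-token/abbreviation matches are drained from working copies of both lists. removeMatches returns the matched names interleaved as (graph author, orcid author) pairs, which is why compare divides the result size by two. A usage sketch from the Java side (sample names are made up; the lists must be mutable because matching removes entries in place):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiFunction;

import eu.dnetlib.pace.util.AuthorMatchers;

public class RemoveMatchesSketch {

	public static void main(String[] args) {
		// Mutable copies: removeMatches deletes matched entries from both lists.
		List<String> graph = new ArrayList<>(Arrays.asList("John Smith", "M. Rossi"));
		List<String> orcid = new ArrayList<>(Arrays.asList("john smith", "Maria Rossi"));

		List<String> matched = AuthorMatchers
			.removeMatches(graph, orcid, (BiFunction<String, String, Boolean>) AuthorMatchers::matchEqualsIgnoreCase);

		System.out.println(matched.size() / 2); // 1 pair: "John Smith" / "john smith"
		System.out.println(graph);              // [M. Rossi], only unmatched authors remain
	}
}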
diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
index 8b3480e60..2c96b7399 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
@@ -43,15 +43,13 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
-import eu.dnetlib.dhp.schema.sx.OafUtils;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import scala.Tuple2;
 
 @ExtendWith(MockitoExtension.class)
 @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
 public class SparkDedupTest implements Serializable {
+	static final boolean CHECK_CARDINALITIES = true;
 
 	@Mock(serializable = true)
 	ISLookUpService isLookUpService;
@@ -191,12 +189,13 @@ public class SparkDedupTest implements Serializable {
 		System.out.println("ds_simrel = " + ds_simrel);
 		System.out.println("orp_simrel = " + orp_simrel);
 
-		assertEquals(751, orgs_simrel);
-		assertEquals(546, pubs_simrel);
-		assertEquals(113, sw_simrel);
-		assertEquals(148, ds_simrel);
-		assertEquals(280, orp_simrel);
-
+		if (CHECK_CARDINALITIES) {
+			assertEquals(751, orgs_simrel);
+			assertEquals(566, pubs_simrel);
+			assertEquals(113, sw_simrel);
+			assertEquals(148, ds_simrel);
+			assertEquals(280, orp_simrel);
+		}
 	}
 
 	@Test
@@ -239,21 +238,27 @@
 			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct"))
 			.count();
 
-		// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
-		assertEquals(751, orgs_simrel);
-		assertEquals(546, pubs_simrel);
-		assertEquals(148, ds_simrel);
-		assertEquals(280, orp_simrel);
-// System.out.println("orgs_simrel = " + orgs_simrel);
-// System.out.println("pubs_simrel = " + pubs_simrel);
-// System.out.println("ds_simrel = " + ds_simrel);
-// System.out.println("orp_simrel = " + orp_simrel);
-
 		// entities simrels to be different from the number of previous step (new simrels in the whitelist)
 		Dataset<Relation> sw_simrel = spark
 			.read()
 			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "software"));
 
+		System.out.println("orgs_simrel = " + orgs_simrel);
+		System.out.println("pubs_simrel = " + pubs_simrel);
+		System.out.println("ds_simrel = " + ds_simrel);
+		System.out.println("orp_simrel = " + orp_simrel);
+		System.out.println("sw_simrel = " + sw_simrel.count());
+
+		// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
+		if (CHECK_CARDINALITIES) {
+			assertEquals(751, orgs_simrel);
+			assertEquals(566, pubs_simrel);
+			assertEquals(148, ds_simrel);
+			assertEquals(280, orp_simrel);
+			assertEquals(115, sw_simrel.count());
+		}
+
+		// check if the first relation in the whitelist exists
 		assertTrue(
 			sw_simrel
@@ -272,10 +277,6 @@
 					rel -> rel.getSource().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[0])
 						&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
 				.count() > 0);
-
-		assertEquals(115, sw_simrel.count());
-// System.out.println("sw_simrel = " + sw_simrel.count());
-
 	}
 
 	@Test
@@ -466,17 +467,19 @@
 				assertTrue(dups.contains(r.getSource()));
 			});
 
-		assertEquals(1268, orgs_mergerel);
-		assertEquals(1112, pubs.count());
-		assertEquals(292, sw_mergerel);
-		assertEquals(476, ds_mergerel);
-		assertEquals(742, orp_mergerel);
-// System.out.println("orgs_mergerel = " + orgs_mergerel);
-// System.out.println("pubs_mergerel = " + pubs_mergerel);
-// System.out.println("sw_mergerel = " + sw_mergerel);
-// System.out.println("ds_mergerel = " + ds_mergerel);
-// System.out.println("orp_mergerel = " + orp_mergerel);
+		System.out.println("orgs_mergerel = " + orgs_mergerel);
+		System.out.println("pubs_mergerel = " + pubs.count());
+		System.out.println("sw_mergerel = " + sw_mergerel);
+		System.out.println("ds_mergerel = " + ds_mergerel);
+		System.out.println("orp_mergerel = " + orp_mergerel);
+
+		if (CHECK_CARDINALITIES) {
+			assertEquals(1268, orgs_mergerel);
+			assertEquals(1156, pubs.count());
+			assertEquals(292, sw_mergerel);
+			assertEquals(476, ds_mergerel);
+			assertEquals(742, orp_mergerel);
+		}
 	}
 
 	@Test
@@ -552,17 +555,19 @@
 				assertTrue(dups.contains(r.getSource()));
 			});
 
-		assertEquals(1268, orgs_mergerel);
-		assertEquals(1112, pubs.count());
-		assertEquals(292, sw_mergerel);
-		assertEquals(476, ds_mergerel);
-		assertEquals(742, orp_mergerel);
-// System.out.println("orgs_mergerel = " + orgs_mergerel);
-// System.out.println("pubs_mergerel = " + pubs_mergerel);
-// System.out.println("sw_mergerel = " + sw_mergerel);
-// System.out.println("ds_mergerel = " + ds_mergerel);
-// System.out.println("orp_mergerel = " + orp_mergerel);
+		System.out.println("orgs_mergerel = " + orgs_mergerel);
+		System.out.println("pubs_mergerel = " + pubs.count());
+		System.out.println("sw_mergerel = " + sw_mergerel);
+		System.out.println("ds_mergerel = " + ds_mergerel);
+		System.out.println("orp_mergerel = " + orp_mergerel);
+
+		if (CHECK_CARDINALITIES) {
+			assertEquals(1268, orgs_mergerel);
+			assertEquals(1156, pubs.count());
+			assertEquals(292, sw_mergerel);
+			assertEquals(476, ds_mergerel);
+			assertEquals(742, orp_mergerel);
+		}
 	}
 
 	@Test
@@ -607,19 +612,21 @@
 				testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
 			.count();
 
-		assertEquals(86, orgs_deduprecord);
-		assertEquals(91, pubs.count());
-		assertEquals(47, sw_deduprecord);
-		assertEquals(97, ds_deduprecord);
-		assertEquals(92, orp_deduprecord);
+		System.out.println("orgs_deduprecord = " + orgs_deduprecord);
+		System.out.println("pubs_deduprecord = " + pubs.count());
+		System.out.println("sw_deduprecord = " + sw_deduprecord);
+		System.out.println("ds_deduprecord = " + ds_deduprecord);
+		System.out.println("orp_deduprecord = " + orp_deduprecord);
+
+		if (CHECK_CARDINALITIES) {
+			assertEquals(86, orgs_deduprecord);
+			assertEquals(96, pubs.count());
+			assertEquals(47, sw_deduprecord);
+			assertEquals(97, ds_deduprecord);
+			assertEquals(92, orp_deduprecord);
+		}
 
 		verifyRoot_1(mapper, pubs);
-
-// System.out.println("orgs_deduprecord = " + orgs_deduprecord);
-// System.out.println("pubs_deduprecord = " + pubs_deduprecord);
-// System.out.println("sw_deduprecord = " + sw_deduprecord);
-// System.out.println("ds_deduprecord = " + ds_deduprecord);
-// System.out.println("orp_deduprecord = " + orp_deduprecord);
 	}
 
 	private static void verifyRoot_1(ObjectMapper mapper, Dataset<Publication> pubs) {
@@ -745,21 +752,23 @@
 			.distinct()
 			.count();
 
-		assertEquals(925, publications);
-		assertEquals(839, organizations);
-		assertEquals(100, projects);
-		assertEquals(100, datasource);
-		assertEquals(196, softwares);
-		assertEquals(389, dataset);
-		assertEquals(520, otherresearchproduct);
+		System.out.println("publications = " + publications);
+		System.out.println("organizations = " + organizations);
+		System.out.println("projects = " + projects);
+		System.out.println("datasource = " + datasource);
+		System.out.println("software = " + softwares);
+		System.out.println("dataset = " + dataset);
+		System.out.println("otherresearchproduct = " + otherresearchproduct);
 
-// System.out.println("publications = " + publications);
-// System.out.println("organizations = " + organizations);
-// System.out.println("projects = " + projects);
-// System.out.println("datasource = " + datasource);
-// System.out.println("software = " + softwares);
-// System.out.println("dataset = " + dataset);
-// System.out.println("otherresearchproduct = " + otherresearchproduct);
+		if (CHECK_CARDINALITIES) {
+			assertEquals(930, publications);
+			assertEquals(839, organizations);
+			assertEquals(100, projects);
+			assertEquals(100, datasource);
+			assertEquals(196, softwares);
+			assertEquals(389, dataset);
+			assertEquals(520, otherresearchproduct);
+		}
 
 		long deletedOrgs = jsc
 			.textFile(testDedupGraphBasePath + "/organization")
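The CHECK_CARDINALITIES constant gathers the exact-count assertions behind one switch while the printlns always report the actual figures; the expected counts themselves shift with the pre-matching introduced above (for example, publication simrels go from 546 to 566). A possible variant, not what this patch does, would route the flag through a JUnit 5 assumption so that disabled checks are reported as skipped instead of silently passing:

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assumptions.assumeTrue;

// Hypothetical helper, not part of the patch: aborts (skips) the calling test
// when cardinality checks are disabled instead of letting them silently pass.
class CardinalityAssert {

	static final boolean CHECK_CARDINALITIES = true;

	static void assertCardinality(String label, long expected, long actual) {
		assumeTrue(CHECK_CARDINALITIES, "cardinality checks disabled");
		assertEquals(expected, actual, label);
	}
}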
-// System.out.println("dataset = " + dataset); -// System.out.println("otherresearchproduct = " + otherresearchproduct); + if (CHECK_CARDINALITIES) { + assertEquals(930, publications); + assertEquals(839, organizations); + assertEquals(100, projects); + assertEquals(100, datasource); + assertEquals(196, softwares); + assertEquals(389, dataset); + assertEquals(520, otherresearchproduct); + } long deletedOrgs = jsc .textFile(testDedupGraphBasePath + "/organization") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala index e2e7fada6..2e23a3a59 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.enrich.orcid import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty} import eu.dnetlib.dhp.schema.sx.OafUtils +import eu.dnetlib.pace.util.AuthorMatchers import java.util import scala.beans.BeanProperty @@ -39,7 +40,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName), + AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName), "fullName" ) ++ // Look after exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName @@ -47,7 +48,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName), + AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName), "reversedFullName" ) ++ // split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations @@ -55,7 +56,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers + AuthorMatchers .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName), "orderedTokens" ) ++ @@ -63,7 +64,7 @@ object ORCIDAuthorEnricher extends Serializable { extractAndEnrichMatches( unmatched_authors, orcid_authors, - (author, orcid) => ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName), + (author, orcid) => AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName), "creditName" ) ++ // look after exact matches in ORCID otherNames @@ -71,7 +72,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - orcid.otherNames != null && ORCIDAuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala), + orcid.otherNames != null && AuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala), "otherNames" ) } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala index f109ebe24..f3a5fe77c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala +++ 
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
index f109ebe24..f3a5fe77c 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
@@ -1,6 +1,6 @@
 package eu.dnetlib.dhp.enrich.orcid
 
-import eu.dnetlib.dhp.enrich.orcid.ORCIDAuthorMatchers.matchOrderedTokenAndAbbreviations
+import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations
 import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
 import org.junit.jupiter.api.Test
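The test class keeps its name and package and only re-imports the matcher from its new eu.dnetlib.pace.util home, so the existing cases keep exercising the same logic. For orientation, illustrative calls against the relocated matcher (made-up names; per the rule above, single-letter tokens may match initials but at least one full-token match is required):

import eu.dnetlib.pace.util.AuthorMatchers;

// Illustrative expectations for the ordered-token/abbreviation matcher.
public class OrderedTokenSketch {

	public static void main(String[] args) {
		System.out.println(AuthorMatchers.matchOrderedTokenAndAbbreviations("J. Smith", "John Smith")); // true: initial + full surname
		System.out.println(AuthorMatchers.matchOrderedTokenAndAbbreviations("J. S.", "John Smith"));    // false: no full-token match
	}
}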