forked from D-Net/dnet-hadoop
Merge pull request 'Enhance Dedup authors matching with algorithms used for ORCID enhancements (task 9690)' (#419) from dedup_authorsmatch_bytoken into beta
Reviewed-on: D-Net/dnet-hadoop#419
This commit is contained in:
commit
da333e9f4d
|
@ -497,9 +497,14 @@ public class MergeUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
|
private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
|
||||||
|
if (d1 == null || StringUtils.isBlank(d1.getValue())) {
|
||||||
|
return d2;
|
||||||
|
} else if (d2 == null || StringUtils.isBlank(d2.getValue())) {
|
||||||
|
return d1;
|
||||||
|
}
|
||||||
|
|
||||||
return Stream
|
return Stream
|
||||||
.of(d1, d2)
|
.of(d1, d2)
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.min(
|
.min(
|
||||||
Comparator
|
Comparator
|
||||||
.comparing(
|
.comparing(
|
||||||
|
|
|
@ -1,16 +1,18 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
import eu.dnetlib.pace.util.AuthorMatchers;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.function.BiFunction;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
@ComparatorClass("authorsMatch")
|
@ComparatorClass("authorsMatch")
|
||||||
public class AuthorsMatch extends AbstractListComparator {
|
public class AuthorsMatch extends AbstractListComparator {
|
||||||
|
@ -41,24 +43,36 @@ public class AuthorsMatch extends AbstractListComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
public double compare(final List<String> left, final List<String> right, final Config conf) {
|
||||||
if (a.isEmpty() || b.isEmpty())
|
if (left.isEmpty() || right.isEmpty())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
|
if (left.size() > SIZE_THRESHOLD || right.size() > SIZE_THRESHOLD)
|
||||||
return 1.0;
|
return 1.0;
|
||||||
|
|
||||||
int maxMiss = Integer.MAX_VALUE;
|
|
||||||
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
|
||||||
|
|
||||||
Double threshold = getDoubleParam("threshold");
|
Double threshold = getDoubleParam("threshold");
|
||||||
|
int maxMiss = Integer.MAX_VALUE;
|
||||||
|
|
||||||
if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) {
|
if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && left.size() == right.size()) {
|
||||||
maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size()));
|
maxMiss = (int) Math.floor((1 - threshold) * Math.max(left.size(), right.size()));
|
||||||
}
|
}
|
||||||
|
|
||||||
int common = 0;
|
int common = 0;
|
||||||
|
|
||||||
|
List<String> a = new ArrayList<>(left);
|
||||||
|
List<String> b = new ArrayList<>(right);
|
||||||
|
|
||||||
|
common += AuthorMatchers
|
||||||
|
.removeMatches(a, b, (BiFunction<String, String, Object>) AuthorMatchers::matchEqualsIgnoreCase)
|
||||||
|
.size() / 2;
|
||||||
|
common += AuthorMatchers
|
||||||
|
.removeMatches(a, b, (BiFunction<String, String, Object>) AuthorMatchers::matchOrderedTokenAndAbbreviations)
|
||||||
|
.size() / 2;
|
||||||
|
|
||||||
|
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||||
|
|
||||||
// compare each element of List1 with each element of List2
|
// compare each element of List1 with each element of List2
|
||||||
|
int alreadyMatched = common;
|
||||||
for (int i = 0; i < a.size(); i++) {
|
for (int i = 0; i < a.size(); i++) {
|
||||||
Person p1 = new Person(a.get(i), false);
|
Person p1 = new Person(a.get(i), false);
|
||||||
|
|
||||||
|
@ -123,13 +137,13 @@ public class AuthorsMatch extends AbstractListComparator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i - common > maxMiss) {
|
if (i - common - alreadyMatched > maxMiss) {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// normalization factor to compute the score
|
// normalization factor to compute the score
|
||||||
int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common);
|
int normFactor = left.size() == right.size() ? left.size() : (left.size() + right.size() - common);
|
||||||
|
|
||||||
if (TYPE.equals("percentage")) {
|
if (TYPE.equals("percentage")) {
|
||||||
return (double) common / normFactor;
|
return (double) common / normFactor;
|
||||||
|
@ -160,5 +174,4 @@ public class AuthorsMatch extends AbstractListComparator {
|
||||||
public String normalization(String s) {
|
public String normalization(String s) {
|
||||||
return normalize(utf8(cleanup(s)));
|
return normalize(utf8(cleanup(s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
package eu.dnetlib.dhp.enrich.orcid
|
package eu.dnetlib.pace.util
|
||||||
|
|
||||||
import java.util.Locale
|
import java.util.Locale
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
|
import scala.util.control.Breaks.{break, breakable}
|
||||||
|
|
||||||
object ORCIDAuthorMatchers {
|
object AuthorMatchers {
|
||||||
val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+")
|
val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+")
|
||||||
|
|
||||||
val WORD_DIFF = 2
|
val WORD_DIFF = 2
|
||||||
|
@ -45,6 +46,7 @@ object ORCIDAuthorMatchers {
|
||||||
var res: Boolean = false
|
var res: Boolean = false
|
||||||
if (e1.length != 1 && e2.length != 1) {
|
if (e1.length != 1 && e2.length != 1) {
|
||||||
res = e1 == e2
|
res = e1 == e2
|
||||||
|
if (res)
|
||||||
longMatches += 1
|
longMatches += 1
|
||||||
} else {
|
} else {
|
||||||
res = true
|
res = true
|
||||||
|
@ -62,4 +64,49 @@ object ORCIDAuthorMatchers {
|
||||||
}
|
}
|
||||||
longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length)
|
longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def removeMatches(
|
||||||
|
graph_authors: java.util.List[String],
|
||||||
|
orcid_authors: java.util.List[String],
|
||||||
|
matchingFunc: java.util.function.BiFunction[String,String,Boolean]
|
||||||
|
) : java.util.List[String] = {
|
||||||
|
removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def removeMatches(
|
||||||
|
graph_authors: java.util.List[String],
|
||||||
|
orcid_authors: java.util.List[String],
|
||||||
|
matchingFunc: (String, String) => Boolean
|
||||||
|
) : java.util.List[String] = {
|
||||||
|
val matched = new java.util.ArrayList[String]()
|
||||||
|
|
||||||
|
if (graph_authors != null && !graph_authors.isEmpty) {
|
||||||
|
val ait = graph_authors.iterator
|
||||||
|
|
||||||
|
while (ait.hasNext) {
|
||||||
|
val author = ait.next()
|
||||||
|
val oit = orcid_authors.iterator
|
||||||
|
|
||||||
|
breakable {
|
||||||
|
while (oit.hasNext) {
|
||||||
|
val orcid = oit.next()
|
||||||
|
|
||||||
|
if (matchingFunc(author, orcid)) {
|
||||||
|
ait.remove()
|
||||||
|
oit.remove()
|
||||||
|
|
||||||
|
matched.add(author)
|
||||||
|
matched.add(orcid)
|
||||||
|
|
||||||
|
break()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
matched
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -43,15 +43,13 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
|
||||||
import eu.dnetlib.dhp.schema.sx.OafUtils;
|
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
public class SparkDedupTest implements Serializable {
|
public class SparkDedupTest implements Serializable {
|
||||||
|
static final boolean CHECK_CARDINALITIES = true;
|
||||||
|
|
||||||
@Mock(serializable = true)
|
@Mock(serializable = true)
|
||||||
ISLookUpService isLookUpService;
|
ISLookUpService isLookUpService;
|
||||||
|
@ -191,12 +189,13 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("ds_simrel = " + ds_simrel);
|
System.out.println("ds_simrel = " + ds_simrel);
|
||||||
System.out.println("orp_simrel = " + orp_simrel);
|
System.out.println("orp_simrel = " + orp_simrel);
|
||||||
|
|
||||||
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(751, orgs_simrel);
|
assertEquals(751, orgs_simrel);
|
||||||
assertEquals(546, pubs_simrel);
|
assertEquals(566, pubs_simrel);
|
||||||
assertEquals(113, sw_simrel);
|
assertEquals(113, sw_simrel);
|
||||||
assertEquals(148, ds_simrel);
|
assertEquals(148, ds_simrel);
|
||||||
assertEquals(280, orp_simrel);
|
assertEquals(280, orp_simrel);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -239,21 +238,27 @@ public class SparkDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
|
||||||
assertEquals(751, orgs_simrel);
|
|
||||||
assertEquals(546, pubs_simrel);
|
|
||||||
assertEquals(148, ds_simrel);
|
|
||||||
assertEquals(280, orp_simrel);
|
|
||||||
// System.out.println("orgs_simrel = " + orgs_simrel);
|
|
||||||
// System.out.println("pubs_simrel = " + pubs_simrel);
|
|
||||||
// System.out.println("ds_simrel = " + ds_simrel);
|
|
||||||
// System.out.println("orp_simrel = " + orp_simrel);
|
|
||||||
|
|
||||||
// entities simrels to be different from the number of previous step (new simrels in the whitelist)
|
// entities simrels to be different from the number of previous step (new simrels in the whitelist)
|
||||||
Dataset<Row> sw_simrel = spark
|
Dataset<Row> sw_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "software"));
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "software"));
|
||||||
|
|
||||||
|
System.out.println("orgs_simrel = " + orgs_simrel);
|
||||||
|
System.out.println("pubs_simrel = " + pubs_simrel);
|
||||||
|
System.out.println("ds_simrel = " + ds_simrel);
|
||||||
|
System.out.println("orp_simrel = " + orp_simrel);
|
||||||
|
System.out.println("sw_simrel = " + sw_simrel.count());
|
||||||
|
|
||||||
|
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
||||||
|
if (CHECK_CARDINALITIES) {
|
||||||
|
assertEquals(751, orgs_simrel);
|
||||||
|
assertEquals(566, pubs_simrel);
|
||||||
|
assertEquals(148, ds_simrel);
|
||||||
|
assertEquals(280, orp_simrel);
|
||||||
|
assertEquals(115, sw_simrel.count());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// check if the first relation in the whitelist exists
|
// check if the first relation in the whitelist exists
|
||||||
assertTrue(
|
assertTrue(
|
||||||
sw_simrel
|
sw_simrel
|
||||||
|
@ -272,10 +277,6 @@ public class SparkDedupTest implements Serializable {
|
||||||
rel -> rel.getSource().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[0])
|
rel -> rel.getSource().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[0])
|
||||||
&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
|
&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
|
||||||
.count() > 0);
|
.count() > 0);
|
||||||
|
|
||||||
assertEquals(115, sw_simrel.count());
|
|
||||||
// System.out.println("sw_simrel = " + sw_simrel.count());
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -466,17 +467,19 @@ public class SparkDedupTest implements Serializable {
|
||||||
assertTrue(dups.contains(r.getSource()));
|
assertTrue(dups.contains(r.getSource()));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
System.out.println("orgs_mergerel = " + orgs_mergerel);
|
||||||
|
System.out.println("pubs_mergerel = " + pubs.count());
|
||||||
|
System.out.println("sw_mergerel = " + sw_mergerel);
|
||||||
|
System.out.println("ds_mergerel = " + ds_mergerel);
|
||||||
|
System.out.println("orp_mergerel = " + orp_mergerel);
|
||||||
|
|
||||||
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(1268, orgs_mergerel);
|
assertEquals(1268, orgs_mergerel);
|
||||||
assertEquals(1112, pubs.count());
|
assertEquals(1156, pubs.count());
|
||||||
assertEquals(292, sw_mergerel);
|
assertEquals(292, sw_mergerel);
|
||||||
assertEquals(476, ds_mergerel);
|
assertEquals(476, ds_mergerel);
|
||||||
assertEquals(742, orp_mergerel);
|
assertEquals(742, orp_mergerel);
|
||||||
// System.out.println("orgs_mergerel = " + orgs_mergerel);
|
}
|
||||||
// System.out.println("pubs_mergerel = " + pubs_mergerel);
|
|
||||||
// System.out.println("sw_mergerel = " + sw_mergerel);
|
|
||||||
// System.out.println("ds_mergerel = " + ds_mergerel);
|
|
||||||
// System.out.println("orp_mergerel = " + orp_mergerel);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -552,17 +555,19 @@ public class SparkDedupTest implements Serializable {
|
||||||
assertTrue(dups.contains(r.getSource()));
|
assertTrue(dups.contains(r.getSource()));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
System.out.println("orgs_mergerel = " + orgs_mergerel);
|
||||||
|
System.out.println("pubs_mergerel = " + pubs.count());
|
||||||
|
System.out.println("sw_mergerel = " + sw_mergerel);
|
||||||
|
System.out.println("ds_mergerel = " + ds_mergerel);
|
||||||
|
System.out.println("orp_mergerel = " + orp_mergerel);
|
||||||
|
|
||||||
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(1268, orgs_mergerel);
|
assertEquals(1268, orgs_mergerel);
|
||||||
assertEquals(1112, pubs.count());
|
assertEquals(1156, pubs.count());
|
||||||
assertEquals(292, sw_mergerel);
|
assertEquals(292, sw_mergerel);
|
||||||
assertEquals(476, ds_mergerel);
|
assertEquals(476, ds_mergerel);
|
||||||
assertEquals(742, orp_mergerel);
|
assertEquals(742, orp_mergerel);
|
||||||
// System.out.println("orgs_mergerel = " + orgs_mergerel);
|
}
|
||||||
// System.out.println("pubs_mergerel = " + pubs_mergerel);
|
|
||||||
// System.out.println("sw_mergerel = " + sw_mergerel);
|
|
||||||
// System.out.println("ds_mergerel = " + ds_mergerel);
|
|
||||||
// System.out.println("orp_mergerel = " + orp_mergerel);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -607,19 +612,21 @@ public class SparkDedupTest implements Serializable {
|
||||||
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
|
System.out.println("orgs_deduprecord = " + orgs_deduprecord);
|
||||||
|
System.out.println("pubs_deduprecord = " + pubs.count());
|
||||||
|
System.out.println("sw_deduprecord = " + sw_deduprecord);
|
||||||
|
System.out.println("ds_deduprecord = " + ds_deduprecord);
|
||||||
|
System.out.println("orp_deduprecord = " + orp_deduprecord);
|
||||||
|
|
||||||
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(86, orgs_deduprecord);
|
assertEquals(86, orgs_deduprecord);
|
||||||
assertEquals(91, pubs.count());
|
assertEquals(96, pubs.count());
|
||||||
assertEquals(47, sw_deduprecord);
|
assertEquals(47, sw_deduprecord);
|
||||||
assertEquals(97, ds_deduprecord);
|
assertEquals(97, ds_deduprecord);
|
||||||
assertEquals(92, orp_deduprecord);
|
assertEquals(92, orp_deduprecord);
|
||||||
|
}
|
||||||
|
|
||||||
verifyRoot_1(mapper, pubs);
|
verifyRoot_1(mapper, pubs);
|
||||||
|
|
||||||
// System.out.println("orgs_deduprecord = " + orgs_deduprecord);
|
|
||||||
// System.out.println("pubs_deduprecord = " + pubs_deduprecord);
|
|
||||||
// System.out.println("sw_deduprecord = " + sw_deduprecord);
|
|
||||||
// System.out.println("ds_deduprecord = " + ds_deduprecord);
|
|
||||||
// System.out.println("orp_deduprecord = " + orp_deduprecord);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void verifyRoot_1(ObjectMapper mapper, Dataset<Publication> pubs) {
|
private static void verifyRoot_1(ObjectMapper mapper, Dataset<Publication> pubs) {
|
||||||
|
@ -745,21 +752,23 @@ public class SparkDedupTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(925, publications);
|
System.out.println("publications = " + publications);
|
||||||
|
System.out.println("organizations = " + organizations);
|
||||||
|
System.out.println("projects = " + projects);
|
||||||
|
System.out.println("datasource = " + datasource);
|
||||||
|
System.out.println("software = " + softwares);
|
||||||
|
System.out.println("dataset = " + dataset);
|
||||||
|
System.out.println("otherresearchproduct = " + otherresearchproduct);
|
||||||
|
|
||||||
|
if (CHECK_CARDINALITIES) {
|
||||||
|
assertEquals(930, publications);
|
||||||
assertEquals(839, organizations);
|
assertEquals(839, organizations);
|
||||||
assertEquals(100, projects);
|
assertEquals(100, projects);
|
||||||
assertEquals(100, datasource);
|
assertEquals(100, datasource);
|
||||||
assertEquals(196, softwares);
|
assertEquals(196, softwares);
|
||||||
assertEquals(389, dataset);
|
assertEquals(389, dataset);
|
||||||
assertEquals(520, otherresearchproduct);
|
assertEquals(520, otherresearchproduct);
|
||||||
|
}
|
||||||
// System.out.println("publications = " + publications);
|
|
||||||
// System.out.println("organizations = " + organizations);
|
|
||||||
// System.out.println("projects = " + projects);
|
|
||||||
// System.out.println("datasource = " + datasource);
|
|
||||||
// System.out.println("software = " + softwares);
|
|
||||||
// System.out.println("dataset = " + dataset);
|
|
||||||
// System.out.println("otherresearchproduct = " + otherresearchproduct);
|
|
||||||
|
|
||||||
long deletedOrgs = jsc
|
long deletedOrgs = jsc
|
||||||
.textFile(testDedupGraphBasePath + "/organization")
|
.textFile(testDedupGraphBasePath + "/organization")
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.enrich.orcid
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
|
||||||
import eu.dnetlib.dhp.schema.sx.OafUtils
|
import eu.dnetlib.dhp.schema.sx.OafUtils
|
||||||
|
import eu.dnetlib.pace.util.AuthorMatchers
|
||||||
|
|
||||||
import java.util
|
import java.util
|
||||||
import scala.beans.BeanProperty
|
import scala.beans.BeanProperty
|
||||||
|
@ -39,7 +40,7 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
unmatched_authors,
|
unmatched_authors,
|
||||||
orcid_authors,
|
orcid_authors,
|
||||||
(author, orcid) =>
|
(author, orcid) =>
|
||||||
ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
||||||
"fullName"
|
"fullName"
|
||||||
) ++
|
) ++
|
||||||
// Look after exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName
|
// Look after exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName
|
||||||
|
@ -47,7 +48,7 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
unmatched_authors,
|
unmatched_authors,
|
||||||
orcid_authors,
|
orcid_authors,
|
||||||
(author, orcid) =>
|
(author, orcid) =>
|
||||||
ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName),
|
AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName),
|
||||||
"reversedFullName"
|
"reversedFullName"
|
||||||
) ++
|
) ++
|
||||||
// split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations
|
// split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations
|
||||||
|
@ -55,7 +56,7 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
unmatched_authors,
|
unmatched_authors,
|
||||||
orcid_authors,
|
orcid_authors,
|
||||||
(author, orcid) =>
|
(author, orcid) =>
|
||||||
ORCIDAuthorMatchers
|
AuthorMatchers
|
||||||
.matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
.matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
||||||
"orderedTokens"
|
"orderedTokens"
|
||||||
) ++
|
) ++
|
||||||
|
@ -63,7 +64,7 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
extractAndEnrichMatches(
|
extractAndEnrichMatches(
|
||||||
unmatched_authors,
|
unmatched_authors,
|
||||||
orcid_authors,
|
orcid_authors,
|
||||||
(author, orcid) => ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName),
|
(author, orcid) => AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName),
|
||||||
"creditName"
|
"creditName"
|
||||||
) ++
|
) ++
|
||||||
// look after exact matches in ORCID otherNames
|
// look after exact matches in ORCID otherNames
|
||||||
|
@ -71,7 +72,7 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
unmatched_authors,
|
unmatched_authors,
|
||||||
orcid_authors,
|
orcid_authors,
|
||||||
(author, orcid) =>
|
(author, orcid) =>
|
||||||
orcid.otherNames != null && ORCIDAuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala),
|
orcid.otherNames != null && AuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala),
|
||||||
"otherNames"
|
"otherNames"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
package eu.dnetlib.dhp.enrich.orcid
|
package eu.dnetlib.dhp.enrich.orcid
|
||||||
|
|
||||||
import eu.dnetlib.dhp.enrich.orcid.ORCIDAuthorMatchers.matchOrderedTokenAndAbbreviations
|
import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations
|
||||||
import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
|
import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue