forked from D-Net/dnet-hadoop
added further test case
This commit is contained in:
parent
6bc7dbeca7
commit
1513174d7e
|
@ -96,7 +96,7 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.toJavaRDD();
|
.toJavaRDD();
|
||||||
// enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
|
enrichedWorksRDD.saveAsTextFile(workingPath + "enrichedWorksText/");
|
||||||
logger.info("Enriched works RDD ready.");
|
logger.info("Enriched works RDD ready.");
|
||||||
|
|
||||||
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
|
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
|
||||||
|
|
|
@ -33,7 +33,7 @@ import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
public class AuthorMatcher {
|
public class AuthorMatcher {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
||||||
private static final Double threshold = 0.8;
|
public static final Double threshold = 0.8;
|
||||||
|
|
||||||
public static void match(AuthorData author, List<Contributor> contributors)
|
public static void match(AuthorData author, List<Contributor> contributors)
|
||||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||||
|
@ -41,16 +41,35 @@ public class AuthorMatcher {
|
||||||
int matchCounter = 0;
|
int matchCounter = 0;
|
||||||
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||||
Contributor contributor = null;
|
Contributor contributor = null;
|
||||||
contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
|
contributors
|
||||||
if (simpleMatch(c.getCreditName(), author.getName()) ||
|
.stream()
|
||||||
simpleMatch(c.getCreditName(), author.getSurname()) ||
|
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||||
simpleMatch(c.getCreditName(), author.getOtherName())) {
|
.forEach(c -> {
|
||||||
matchCounters.set(0, matchCounters.get(0) + 1);
|
if (simpleMatch(c.getCreditName(), author.getName()) ||
|
||||||
c.setSimpleMatch(true);
|
simpleMatch(c.getCreditName(), author.getSurname()) ||
|
||||||
}
|
simpleMatch(c.getCreditName(), author.getOtherName())) {
|
||||||
});
|
matchCounters.set(0, matchCounters.get(0) + 1);
|
||||||
|
c.setSimpleMatch(true);
|
||||||
|
}
|
||||||
|
});
|
||||||
if (matchCounters.get(0) == 1) {
|
if (matchCounters.get(0) == 1) {
|
||||||
updateAuthorsSimpleMatch(contributors, author);
|
updateAuthorsSimpleMatch(contributors, author);
|
||||||
|
} else if (matchCounters.get(0) == 0) {
|
||||||
|
Optional<Contributor> optCon = contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||||
|
.map(c -> {
|
||||||
|
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
|
||||||
|
return c;
|
||||||
|
})
|
||||||
|
.filter(c -> c.getScore() >= threshold)
|
||||||
|
.max(Comparator.comparing(c -> c.getScore()));
|
||||||
|
Contributor bestMatchContributor = null;
|
||||||
|
if (optCon.isPresent()) {
|
||||||
|
bestMatchContributor = optCon.get();
|
||||||
|
bestMatchContributor.setBestMatch(true);
|
||||||
|
updateAuthorsSimilarityMatch(contributors, author);
|
||||||
|
}
|
||||||
} else if (matchCounters.get(0) > 1) {
|
} else if (matchCounters.get(0) > 1) {
|
||||||
Optional<Contributor> optCon = contributors
|
Optional<Contributor> optCon = contributors
|
||||||
.stream()
|
.stream()
|
||||||
|
@ -68,19 +87,18 @@ public class AuthorMatcher {
|
||||||
bestMatchContributor.setBestMatch(true);
|
bestMatchContributor.setBestMatch(true);
|
||||||
updateAuthorsSimilarityMatch(contributors, author);
|
updateAuthorsSimilarityMatch(contributors, author);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean simpleMatch(String name, String searchValue) {
|
public static boolean simpleMatch(String name, String searchValue) {
|
||||||
if (searchValue == null) {
|
if (searchValue == null) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return normalize(name).contains(normalize(searchValue));
|
return normalize(name).contains(normalize(searchValue));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
public static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
||||||
String[] contributorSplitted = contributor.split(" ");
|
String[] contributorSplitted = contributor.split(" ");
|
||||||
if (contributorSplitted.length == 0) {
|
if (contributorSplitted.length == 0) {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
@ -106,7 +124,7 @@ public class AuthorMatcher {
|
||||||
return sm2;
|
return sm2;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
public static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
||||||
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
||||||
return score;
|
return score;
|
||||||
}
|
}
|
||||||
|
@ -115,7 +133,7 @@ public class AuthorMatcher {
|
||||||
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String normalize(final String s) {
|
public static String normalize(final String s) {
|
||||||
if (s == null) {
|
if (s == null) {
|
||||||
return new String("");
|
return new String("");
|
||||||
}
|
}
|
||||||
|
@ -140,7 +158,7 @@ public class AuthorMatcher {
|
||||||
return surname + " " + name;
|
return surname + " " + name;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
public static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
||||||
contributors.forEach(c -> {
|
contributors.forEach(c -> {
|
||||||
if (c.isSimpleMatch()) {
|
if (c.isSimpleMatch()) {
|
||||||
c.setName(author.getName());
|
c.setName(author.getName());
|
||||||
|
@ -151,7 +169,7 @@ public class AuthorMatcher {
|
||||||
updateRanks(contributors);
|
updateRanks(contributors);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
public static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
||||||
contributors
|
contributors
|
||||||
.stream()
|
.stream()
|
||||||
.filter(c -> c.isBestMatch())
|
.filter(c -> c.isBestMatch())
|
||||||
|
|
|
@ -38,12 +38,9 @@ public class OrcidNoDoiTest {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
|
private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
|
||||||
|
|
||||||
String nameA = "Khairy";
|
static String nameA = "Khairy";
|
||||||
String surnameA = "Abdel Dayem";
|
static String surnameA = "Abdel Dayem";
|
||||||
String otherNameA = "Dayem MKA";
|
static String orcidIdA = "0000-0003-2760-1191";
|
||||||
String nameB = "K";
|
|
||||||
String surnameB = "Abdel-Dayem";
|
|
||||||
String orcidIdA = "0000-0003-2760-1191";
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void readPublicationFieldsTest()
|
public void readPublicationFieldsTest()
|
||||||
|
@ -99,7 +96,7 @@ public class OrcidNoDoiTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void authorMatchTest() throws Exception {
|
public void authorDoubleMatchTest() throws Exception {
|
||||||
logger.info("running authorSimpleMatchTest ....");
|
logger.info("running authorSimpleMatchTest ....");
|
||||||
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
|
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
|
||||||
AuthorData author = new AuthorData();
|
AuthorData author = new AuthorData();
|
||||||
|
@ -129,16 +126,8 @@ public class OrcidNoDoiTest {
|
||||||
assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
|
assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
|
||||||
|
|
||||||
AuthorMatcher.match(author, workData.getContributors());
|
AuthorMatcher.match(author, workData.getContributors());
|
||||||
GsonBuilder builder = new GsonBuilder();
|
|
||||||
Gson gson = builder.create();
|
|
||||||
logger.info(gson.toJson(workData));
|
|
||||||
|
|
||||||
assertTrue(workData.getContributors().size() == 6);
|
assertTrue(workData.getContributors().size() == 6);
|
||||||
Contributor c = workData.getContributors().get(0);
|
|
||||||
assertTrue(c.getOid().equals("0000-0003-2760-1191"));
|
|
||||||
assertTrue(c.getName().equals("Khairy"));
|
|
||||||
assertTrue(c.getSurname().equals("Abdel Dayem"));
|
|
||||||
assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -180,4 +169,166 @@ public class OrcidNoDoiTest {
|
||||||
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
|
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
|
||||||
assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
|
assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void authorSimpleMatchTest() throws Exception {
|
||||||
|
String orcidWork = "activity_work_0000-0002-5982-8983.xml";
|
||||||
|
AuthorData author = new AuthorData();
|
||||||
|
author.setName("Parkhouse");
|
||||||
|
author.setSurname("H.");
|
||||||
|
author.setOid("0000-0002-5982-8983");
|
||||||
|
String xml = IOUtils
|
||||||
|
.toString(
|
||||||
|
OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
|
||||||
|
|
||||||
|
if (xml == null) {
|
||||||
|
logger.info("Resource not found");
|
||||||
|
}
|
||||||
|
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
|
||||||
|
if (p == null) {
|
||||||
|
logger.info("XMLRecordParserNoDoi null");
|
||||||
|
}
|
||||||
|
WorkDataNoDoi workData = null;
|
||||||
|
try {
|
||||||
|
workData = p.VTDParseWorkData(xml.getBytes());
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("parsing xml", e);
|
||||||
|
}
|
||||||
|
assertNotNull(workData);
|
||||||
|
|
||||||
|
Contributor a = workData.getContributors().get(0);
|
||||||
|
assertTrue(a.getCreditName().equals("Parkhouse, H."));
|
||||||
|
|
||||||
|
AuthorMatcher.match(author, workData.getContributors());
|
||||||
|
|
||||||
|
assertTrue(workData.getContributors().size() == 2);
|
||||||
|
Contributor c = workData.getContributors().get(0);
|
||||||
|
assertTrue(c.getOid().equals("0000-0002-5982-8983"));
|
||||||
|
assertTrue(c.getName().equals("Parkhouse"));
|
||||||
|
assertTrue(c.getSurname().equals("H."));
|
||||||
|
assertTrue(c.getCreditName().equals("Parkhouse, H."));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void match() {
|
||||||
|
|
||||||
|
AuthorData author = new AuthorData();
|
||||||
|
author.setName("Joe");
|
||||||
|
author.setSurname("Dodge");
|
||||||
|
author.setOid("0000-1111-2222-3333");
|
||||||
|
Contributor contributor = new Contributor();
|
||||||
|
contributor.setCreditName("Joe Dodge");
|
||||||
|
List<Contributor> contributors = Arrays.asList(contributor);
|
||||||
|
AuthorMatcher am = new AuthorMatcher();
|
||||||
|
int matchCounter = 0;
|
||||||
|
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||||
|
contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||||
|
.forEach(c -> {
|
||||||
|
if (am.simpleMatch(c.getCreditName(), author.getName()) ||
|
||||||
|
am.simpleMatch(c.getCreditName(), author.getSurname()) ||
|
||||||
|
am.simpleMatch(c.getCreditName(), author.getOtherName())) {
|
||||||
|
matchCounters.set(0, matchCounters.get(0) + 1);
|
||||||
|
c.setSimpleMatch(true);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
assertTrue(matchCounters.get(0) == 1);
|
||||||
|
am.updateAuthorsSimpleMatch(contributors, author);
|
||||||
|
assertTrue(contributors.get(0).getName().equals("Joe"));
|
||||||
|
assertTrue(contributors.get(0).getSurname().equals("Dodge"));
|
||||||
|
assertTrue(contributors.get(0).getCreditName().equals("Joe Dodge"));
|
||||||
|
assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));
|
||||||
|
|
||||||
|
AuthorData authorX = new AuthorData();
|
||||||
|
authorX.setName(nameA);
|
||||||
|
authorX.setSurname(surnameA);
|
||||||
|
authorX.setOid(orcidIdA);
|
||||||
|
Contributor contributorA = new Contributor();
|
||||||
|
contributorA.setCreditName("Abdel-Dayem Khai");
|
||||||
|
Contributor contributorB = new Contributor();
|
||||||
|
contributorB.setCreditName("Abdel-Dayem Fake");
|
||||||
|
List<Contributor> contributorList = new ArrayList<>();
|
||||||
|
contributorList.add(contributorA);
|
||||||
|
contributorList.add(contributorB);
|
||||||
|
int matchCounter2 = 0;
|
||||||
|
List<Integer> matchCounters2 = Arrays.asList(matchCounter2);
|
||||||
|
contributorList
|
||||||
|
.stream()
|
||||||
|
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||||
|
.forEach(c -> {
|
||||||
|
if (am.simpleMatch(c.getCreditName(), authorX.getName()) ||
|
||||||
|
am.simpleMatch(c.getCreditName(), authorX.getSurname()) ||
|
||||||
|
am.simpleMatch(c.getCreditName(), authorX.getOtherName())) {
|
||||||
|
int currentCounter = matchCounters2.get(0);
|
||||||
|
currentCounter += 1;
|
||||||
|
matchCounters2.set(0, currentCounter);
|
||||||
|
c.setSimpleMatch(true);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
assertTrue(matchCounters2.get(0) == 2);
|
||||||
|
assertTrue(contributorList.get(0).isSimpleMatch());
|
||||||
|
assertTrue(contributorList.get(1).isSimpleMatch());
|
||||||
|
|
||||||
|
Optional<Contributor> optCon = contributorList
|
||||||
|
.stream()
|
||||||
|
.filter(c -> c.isSimpleMatch())
|
||||||
|
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||||
|
.map(c -> {
|
||||||
|
c.setScore(am.bestMatch(authorX.getName(), authorX.getSurname(), c.getCreditName()));
|
||||||
|
return c;
|
||||||
|
})
|
||||||
|
.filter(c -> c.getScore() >= AuthorMatcher.threshold)
|
||||||
|
.max(Comparator.comparing(c -> c.getScore()));
|
||||||
|
assertTrue(optCon.isPresent());
|
||||||
|
|
||||||
|
final Contributor bestMatchContributor = optCon.get();
|
||||||
|
bestMatchContributor.setBestMatch(true);
|
||||||
|
assertTrue(bestMatchContributor.getCreditName().equals("Abdel-Dayem Khai"));
|
||||||
|
assertTrue(contributorList.get(0).isBestMatch());
|
||||||
|
assertTrue(!contributorList.get(1).isBestMatch());
|
||||||
|
am.updateAuthorsSimilarityMatch(contributorList, authorX);
|
||||||
|
assertTrue(contributorList.get(0).getName().equals(nameA));
|
||||||
|
assertTrue(contributorList.get(0).getSurname().equals(surnameA));
|
||||||
|
assertTrue(contributorList.get(0).getCreditName().equals("Abdel-Dayem Khai"));
|
||||||
|
assertTrue(contributorList.get(0).getOid().equals(orcidIdA));
|
||||||
|
assertTrue(StringUtils.isBlank(contributorList.get(1).getOid()));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void authorBestMatchTest() throws Exception {
|
||||||
|
String name = "Khairy";
|
||||||
|
String surname = "Abdel Dayem";
|
||||||
|
String orcidWork = "activity_work_0000-0003-2760-1191.xml";
|
||||||
|
AuthorData author = new AuthorData();
|
||||||
|
author.setName(name);
|
||||||
|
author.setSurname(surname);
|
||||||
|
author.setOid(orcidIdA);
|
||||||
|
String xml = IOUtils
|
||||||
|
.toString(
|
||||||
|
OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
|
||||||
|
|
||||||
|
if (xml == null) {
|
||||||
|
logger.info("Resource not found");
|
||||||
|
}
|
||||||
|
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
|
||||||
|
if (p == null) {
|
||||||
|
logger.info("XMLRecordParserNoDoi null");
|
||||||
|
}
|
||||||
|
WorkDataNoDoi workData = null;
|
||||||
|
try {
|
||||||
|
workData = p.VTDParseWorkData(xml.getBytes());
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("parsing xml", e);
|
||||||
|
}
|
||||||
|
AuthorMatcher.match(author, workData.getContributors());
|
||||||
|
assertTrue(workData.getContributors().size() == 5);
|
||||||
|
List<Contributor> c = workData.getContributors();
|
||||||
|
assertTrue(c.get(0).getName().equals(name));
|
||||||
|
assertTrue(c.get(0).getSurname().equals(surname));
|
||||||
|
assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye"));
|
||||||
|
assertTrue(c.get(0).getOid().equals(orcidIdA));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -68,7 +68,7 @@
|
||||||
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
|
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
|
||||||
<work:contributors>
|
<work:contributors>
|
||||||
<work:contributor>
|
<work:contributor>
|
||||||
<work:credit-name>Abdel-Dayem K</work:credit-name>
|
<work:credit-name>Khair Abde Daye</work:credit-name>
|
||||||
<work:contributor-attributes>
|
<work:contributor-attributes>
|
||||||
<work:contributor-sequence>first</work:contributor-sequence>
|
<work:contributor-sequence>first</work:contributor-sequence>
|
||||||
<work:contributor-role>author</work:contributor-role>
|
<work:contributor-role>author</work:contributor-role>
|
||||||
|
|
Loading…
Reference in New Issue