orcid-no-doi #43

Merged
claudio.atzori merged 45 commits from enrico.ottonello/dnet-hadoop:orcid-no-doi into master 2020-12-02 10:55:12 +01:00
4 changed files with 202 additions and 33 deletions
Showing only changes of commit 1513174d7e - Show all commits

View File

@ -96,7 +96,7 @@ public class SparkGenEnrichedOrcidWorks {
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.filter(Objects::nonNull)
.toJavaRDD();
// enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
enrichedWorksRDD.saveAsTextFile(workingPath + "enrichedWorksText/");
logger.info("Enriched works RDD ready.");
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");

View File

@ -33,7 +33,7 @@ import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
public class AuthorMatcher {
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
private static final Double threshold = 0.8;
public static final Double threshold = 0.8;
public static void match(AuthorData author, List<Contributor> contributors)
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
@ -41,7 +41,10 @@ public class AuthorMatcher {
int matchCounter = 0;
List<Integer> matchCounters = Arrays.asList(matchCounter);
Contributor contributor = null;
contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
contributors
.stream()
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.forEach(c -> {
if (simpleMatch(c.getCreditName(), author.getName()) ||
simpleMatch(c.getCreditName(), author.getSurname()) ||
simpleMatch(c.getCreditName(), author.getOtherName())) {
@ -51,6 +54,22 @@ public class AuthorMatcher {
});
if (matchCounters.get(0) == 1) {
updateAuthorsSimpleMatch(contributors, author);
} else if (matchCounters.get(0) == 0) {
Optional<Contributor> optCon = contributors
.stream()
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.map(c -> {
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
return c;
})
.filter(c -> c.getScore() >= threshold)
.max(Comparator.comparing(c -> c.getScore()));
Contributor bestMatchContributor = null;
if (optCon.isPresent()) {
bestMatchContributor = optCon.get();
bestMatchContributor.setBestMatch(true);
updateAuthorsSimilarityMatch(contributors, author);
}
} else if (matchCounters.get(0) > 1) {
Optional<Contributor> optCon = contributors
.stream()
@ -68,19 +87,18 @@ public class AuthorMatcher {
bestMatchContributor.setBestMatch(true);
updateAuthorsSimilarityMatch(contributors, author);
}
}
}
private static boolean simpleMatch(String name, String searchValue) {
/**
 * Tells whether the normalized {@code name} contains the normalized {@code searchValue}.
 *
 * @param name        the text to search in (e.g. a contributor credit name)
 * @param searchValue the text to look for; {@code null} never matches
 * @return {@code true} when {@code searchValue} is not null and its normalized form is a
 *         substring of the normalized {@code name}
 */
public static boolean simpleMatch(String name, String searchValue) {
    return searchValue != null && normalize(name).contains(normalize(searchValue));
}
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
public static Double bestMatch(String authorSurname, String authorName, String contributor) {
String[] contributorSplitted = contributor.split(" ");
if (contributorSplitted.length == 0) {
return 0.0;
@ -106,7 +124,7 @@ public class AuthorMatcher {
return sm2;
}
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
/**
 * Scores how similar two (name, surname) pairs are by delegating to
 * {@code similarityJaroWinkler} with the same arguments.
 *
 * @return the score produced by {@code similarityJaroWinkler}
 */
public static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
    return similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
}
@ -115,7 +133,7 @@ public class AuthorMatcher {
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
}
private static String normalize(final String s) {
public static String normalize(final String s) {
if (s == null) {
return new String("");
}
@ -140,7 +158,7 @@ public class AuthorMatcher {
return surname + " " + name;
}
private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
public static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
contributors.forEach(c -> {
if (c.isSimpleMatch()) {
c.setName(author.getName());
@ -151,7 +169,7 @@ public class AuthorMatcher {
updateRanks(contributors);
}
private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
public static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
contributors
.stream()
.filter(c -> c.isBestMatch())

View File

@ -38,12 +38,9 @@ public class OrcidNoDoiTest {
private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
String nameA = "Khairy";
String surnameA = "Abdel Dayem";
String otherNameA = "Dayem MKA";
String nameB = "K";
String surnameB = "Abdel-Dayem";
String orcidIdA = "0000-0003-2760-1191";
static String nameA = "Khairy";
static String surnameA = "Abdel Dayem";
static String orcidIdA = "0000-0003-2760-1191";
@Test
public void readPublicationFieldsTest()
@ -99,7 +96,7 @@ public class OrcidNoDoiTest {
}
@Test
public void authorMatchTest() throws Exception {
public void authorDoubleMatchTest() throws Exception {
logger.info("running authorSimpleMatchTest ....");
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
AuthorData author = new AuthorData();
@ -129,16 +126,8 @@ public class OrcidNoDoiTest {
assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
AuthorMatcher.match(author, workData.getContributors());
GsonBuilder builder = new GsonBuilder();
Gson gson = builder.create();
logger.info(gson.toJson(workData));
assertTrue(workData.getContributors().size() == 6);
Contributor c = workData.getContributors().get(0);
assertTrue(c.getOid().equals("0000-0003-2760-1191"));
assertTrue(c.getName().equals("Khairy"));
assertTrue(c.getSurname().equals("Abdel Dayem"));
assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
}
@Test
@ -180,4 +169,166 @@ public class OrcidNoDoiTest {
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
}
/**
 * Parses the work fixture {@code activity_work_0000-0002-5982-8983.xml}, runs
 * {@link AuthorMatcher#match} for the author "Parkhouse" / "H." and verifies that the first
 * contributor (credited as "Parkhouse, H.") is enriched with the author's name, surname and
 * ORCID id while keeping its credit name.
 */
@Test
public void authorSimpleMatchTest() throws Exception {
    String orcidWork = "activity_work_0000-0002-5982-8983.xml";
    AuthorData author = new AuthorData();
    author.setName("Parkhouse");
    author.setSurname("H.");
    author.setOid("0000-0002-5982-8983");

    // A missing fixture must fail here with a clear assertion rather than later with an
    // unrelated NullPointerException; try-with-resources also releases the stream.
    String xml;
    try (java.io.InputStream in = OrcidNoDoiTest.class.getResourceAsStream(orcidWork)) {
        assertNotNull(in);
        // assumes the fixture is UTF-8 encoded -- TODO confirm against the XML declaration
        xml = IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
    }

    // Parsing failures propagate (the method declares "throws Exception") so the real
    // cause is reported instead of being logged and replaced by a null check failure.
    XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
    WorkDataNoDoi workData = p.VTDParseWorkData(xml.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    assertNotNull(workData);

    Contributor a = workData.getContributors().get(0);
    assertTrue(a.getCreditName().equals("Parkhouse, H."));

    AuthorMatcher.match(author, workData.getContributors());

    assertTrue(workData.getContributors().size() == 2);
    Contributor c = workData.getContributors().get(0);
    assertTrue(c.getOid().equals("0000-0002-5982-8983"));
    assertTrue(c.getName().equals("Parkhouse"));
    assertTrue(c.getSurname().equals("H."));
    assertTrue(c.getCreditName().equals("Parkhouse, H."));
}
/**
 * Replays the steps of the author matching on hand-built contributors, using the public
 * static helpers of {@link AuthorMatcher} directly:
 * <ol>
 * <li>a single contributor flagged by the simple (substring) match is updated with the
 * author's data;</li>
 * <li>when two contributors are flagged by the simple match, the one with the highest
 * similarity score at or above {@link AuthorMatcher#threshold} is marked as best match and
 * is the only one updated with the author's data.</li>
 * </ol>
 */
@Test
public void match() {
    // --- case 1: exactly one simple match ---
    AuthorData author = new AuthorData();
    author.setName("Joe");
    author.setSurname("Dodge");
    author.setOid("0000-1111-2222-3333");
    Contributor contributor = new Contributor();
    contributor.setCreditName("Joe Dodge");
    List<Contributor> contributors = Arrays.asList(contributor);

    assertTrue(flagSimpleMatches(contributors, author) == 1);
    AuthorMatcher.updateAuthorsSimpleMatch(contributors, author);
    assertTrue(contributors.get(0).getName().equals("Joe"));
    assertTrue(contributors.get(0).getSurname().equals("Dodge"));
    assertTrue(contributors.get(0).getCreditName().equals("Joe Dodge"));
    assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));

    // --- case 2: two simple matches, disambiguated by similarity score ---
    AuthorData authorX = new AuthorData();
    authorX.setName(nameA);
    authorX.setSurname(surnameA);
    authorX.setOid(orcidIdA);
    Contributor contributorA = new Contributor();
    contributorA.setCreditName("Abdel-Dayem Khai");
    Contributor contributorB = new Contributor();
    contributorB.setCreditName("Abdel-Dayem Fake");
    List<Contributor> contributorList = new ArrayList<>();
    contributorList.add(contributorA);
    contributorList.add(contributorB);

    assertTrue(flagSimpleMatches(contributorList, authorX) == 2);
    assertTrue(contributorList.get(0).isSimpleMatch());
    assertTrue(contributorList.get(1).isSimpleMatch());

    // NOTE(review): bestMatch declares its parameters as (authorSurname, authorName,
    // contributor) but receives (name, surname, creditName) here, mirroring the call made
    // in AuthorMatcher.match -- confirm the intended argument order.
    Optional<Contributor> optCon = contributorList
        .stream()
        .filter(Contributor::isSimpleMatch)
        .filter(c -> !StringUtils.isBlank(c.getCreditName()))
        .map(c -> {
            c.setScore(AuthorMatcher.bestMatch(authorX.getName(), authorX.getSurname(), c.getCreditName()));
            return c;
        })
        .filter(c -> c.getScore() >= AuthorMatcher.threshold)
        .max(Comparator.comparing(c -> c.getScore()));
    assertTrue(optCon.isPresent());

    final Contributor bestMatchContributor = optCon.get();
    bestMatchContributor.setBestMatch(true);
    assertTrue(bestMatchContributor.getCreditName().equals("Abdel-Dayem Khai"));
    assertTrue(contributorList.get(0).isBestMatch());
    assertTrue(!contributorList.get(1).isBestMatch());

    AuthorMatcher.updateAuthorsSimilarityMatch(contributorList, authorX);
    assertTrue(contributorList.get(0).getName().equals(nameA));
    assertTrue(contributorList.get(0).getSurname().equals(surnameA));
    assertTrue(contributorList.get(0).getCreditName().equals("Abdel-Dayem Khai"));
    assertTrue(contributorList.get(0).getOid().equals(orcidIdA));
    assertTrue(StringUtils.isBlank(contributorList.get(1).getOid()));
}

/**
 * Flags as simple match every contributor with a non-blank credit name that contains the
 * author's name, surname or other name (per {@link AuthorMatcher#simpleMatch}).
 *
 * @return the number of contributors flagged by this call
 */
private static int flagSimpleMatches(List<Contributor> contributors, AuthorData author) {
    int flagged = 0;
    for (Contributor c : contributors) {
        String creditName = c.getCreditName();
        if (StringUtils.isBlank(creditName)) {
            continue;
        }
        if (AuthorMatcher.simpleMatch(creditName, author.getName())
            || AuthorMatcher.simpleMatch(creditName, author.getSurname())
            || AuthorMatcher.simpleMatch(creditName, author.getOtherName())) {
            c.setSimpleMatch(true);
            flagged++;
        }
    }
    return flagged;
}
/**
 * Parses the work fixture {@code activity_work_0000-0003-2760-1191.xml}, runs
 * {@link AuthorMatcher#match} for the author "Khairy" / "Abdel Dayem" and verifies that the
 * first contributor (credited as "Khair Abde Daye") is enriched with the author's name,
 * surname and ORCID id. Presumably this exercises the similarity-based fallback, since the
 * credit name is only an approximation of the author's names -- confirm against the fixture.
 */
@Test
public void authorBestMatchTest() throws Exception {
    String name = "Khairy";
    String surname = "Abdel Dayem";
    String orcidWork = "activity_work_0000-0003-2760-1191.xml";
    AuthorData author = new AuthorData();
    author.setName(name);
    author.setSurname(surname);
    author.setOid(orcidIdA);

    // A missing fixture must fail here with a clear assertion rather than later with an
    // unrelated NullPointerException; try-with-resources also releases the stream.
    String xml;
    try (java.io.InputStream in = OrcidNoDoiTest.class.getResourceAsStream(orcidWork)) {
        assertNotNull(in);
        // assumes the fixture is UTF-8 encoded -- TODO confirm against the XML declaration
        xml = IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
    }

    // Parsing failures propagate (the method declares "throws Exception") instead of being
    // logged and turning into a NullPointerException on the next dereference.
    XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
    WorkDataNoDoi workData = p.VTDParseWorkData(xml.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    assertNotNull(workData);

    AuthorMatcher.match(author, workData.getContributors());

    assertTrue(workData.getContributors().size() == 5);
    List<Contributor> c = workData.getContributors();
    assertTrue(c.get(0).getName().equals(name));
    assertTrue(c.get(0).getSurname().equals(surname));
    assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye"));
    assertTrue(c.get(0).getOid().equals(orcidIdA));
}
}

View File

@ -68,7 +68,7 @@
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
<work:contributors>
<work:contributor>
<work:credit-name>Abdel-Dayem K</work:credit-name>
<work:credit-name>Khair Abde Daye</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>