forked from antonis.lempesis/dnet-hadoop
added further test case
This commit is contained in:
parent
6bc7dbeca7
commit
1513174d7e
|
@ -96,7 +96,7 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||
.filter(Objects::nonNull)
|
||||
.toJavaRDD();
|
||||
// enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
|
||||
enrichedWorksRDD.saveAsTextFile(workingPath + "enrichedWorksText/");
|
||||
logger.info("Enriched works RDD ready.");
|
||||
|
||||
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
|
||||
|
|
|
@ -33,7 +33,7 @@ import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
|||
public class AuthorMatcher {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
||||
private static final Double threshold = 0.8;
|
||||
public static final Double threshold = 0.8;
|
||||
|
||||
public static void match(AuthorData author, List<Contributor> contributors)
|
||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||
|
@ -41,7 +41,10 @@ public class AuthorMatcher {
|
|||
int matchCounter = 0;
|
||||
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||
Contributor contributor = null;
|
||||
contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
|
||||
contributors
|
||||
.stream()
|
||||
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||
.forEach(c -> {
|
||||
if (simpleMatch(c.getCreditName(), author.getName()) ||
|
||||
simpleMatch(c.getCreditName(), author.getSurname()) ||
|
||||
simpleMatch(c.getCreditName(), author.getOtherName())) {
|
||||
|
@ -51,6 +54,22 @@ public class AuthorMatcher {
|
|||
});
|
||||
if (matchCounters.get(0) == 1) {
|
||||
updateAuthorsSimpleMatch(contributors, author);
|
||||
} else if (matchCounters.get(0) == 0) {
|
||||
Optional<Contributor> optCon = contributors
|
||||
.stream()
|
||||
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||
.map(c -> {
|
||||
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
|
||||
return c;
|
||||
})
|
||||
.filter(c -> c.getScore() >= threshold)
|
||||
.max(Comparator.comparing(c -> c.getScore()));
|
||||
Contributor bestMatchContributor = null;
|
||||
if (optCon.isPresent()) {
|
||||
bestMatchContributor = optCon.get();
|
||||
bestMatchContributor.setBestMatch(true);
|
||||
updateAuthorsSimilarityMatch(contributors, author);
|
||||
}
|
||||
} else if (matchCounters.get(0) > 1) {
|
||||
Optional<Contributor> optCon = contributors
|
||||
.stream()
|
||||
|
@ -68,19 +87,18 @@ public class AuthorMatcher {
|
|||
bestMatchContributor.setBestMatch(true);
|
||||
updateAuthorsSimilarityMatch(contributors, author);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static boolean simpleMatch(String name, String searchValue) {
|
||||
public static boolean simpleMatch(String name, String searchValue) {
|
||||
if (searchValue == null) {
|
||||
return false;
|
||||
}
|
||||
return normalize(name).contains(normalize(searchValue));
|
||||
}
|
||||
|
||||
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
||||
public static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
||||
String[] contributorSplitted = contributor.split(" ");
|
||||
if (contributorSplitted.length == 0) {
|
||||
return 0.0;
|
||||
|
@ -106,7 +124,7 @@ public class AuthorMatcher {
|
|||
return sm2;
|
||||
}
|
||||
|
||||
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
||||
public static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
||||
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
||||
return score;
|
||||
}
|
||||
|
@ -115,7 +133,7 @@ public class AuthorMatcher {
|
|||
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
||||
}
|
||||
|
||||
private static String normalize(final String s) {
|
||||
public static String normalize(final String s) {
|
||||
if (s == null) {
|
||||
return new String("");
|
||||
}
|
||||
|
@ -140,7 +158,7 @@ public class AuthorMatcher {
|
|||
return surname + " " + name;
|
||||
}
|
||||
|
||||
private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
||||
public static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
||||
contributors.forEach(c -> {
|
||||
if (c.isSimpleMatch()) {
|
||||
c.setName(author.getName());
|
||||
|
@ -151,7 +169,7 @@ public class AuthorMatcher {
|
|||
updateRanks(contributors);
|
||||
}
|
||||
|
||||
private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
||||
public static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
||||
contributors
|
||||
.stream()
|
||||
.filter(c -> c.isBestMatch())
|
||||
|
|
|
@ -38,12 +38,9 @@ public class OrcidNoDoiTest {
|
|||
|
||||
private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
|
||||
|
||||
String nameA = "Khairy";
|
||||
String surnameA = "Abdel Dayem";
|
||||
String otherNameA = "Dayem MKA";
|
||||
String nameB = "K";
|
||||
String surnameB = "Abdel-Dayem";
|
||||
String orcidIdA = "0000-0003-2760-1191";
|
||||
static String nameA = "Khairy";
|
||||
static String surnameA = "Abdel Dayem";
|
||||
static String orcidIdA = "0000-0003-2760-1191";
|
||||
|
||||
@Test
|
||||
public void readPublicationFieldsTest()
|
||||
|
@ -99,7 +96,7 @@ public class OrcidNoDoiTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void authorMatchTest() throws Exception {
|
||||
public void authorDoubleMatchTest() throws Exception {
|
||||
logger.info("running authorSimpleMatchTest ....");
|
||||
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
|
||||
AuthorData author = new AuthorData();
|
||||
|
@ -129,16 +126,8 @@ public class OrcidNoDoiTest {
|
|||
assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
|
||||
|
||||
AuthorMatcher.match(author, workData.getContributors());
|
||||
GsonBuilder builder = new GsonBuilder();
|
||||
Gson gson = builder.create();
|
||||
logger.info(gson.toJson(workData));
|
||||
|
||||
assertTrue(workData.getContributors().size() == 6);
|
||||
Contributor c = workData.getContributors().get(0);
|
||||
assertTrue(c.getOid().equals("0000-0003-2760-1191"));
|
||||
assertTrue(c.getName().equals("Khairy"));
|
||||
assertTrue(c.getSurname().equals("Abdel Dayem"));
|
||||
assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -180,4 +169,166 @@ public class OrcidNoDoiTest {
|
|||
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
|
||||
assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void authorSimpleMatchTest() throws Exception {
|
||||
String orcidWork = "activity_work_0000-0002-5982-8983.xml";
|
||||
AuthorData author = new AuthorData();
|
||||
author.setName("Parkhouse");
|
||||
author.setSurname("H.");
|
||||
author.setOid("0000-0002-5982-8983");
|
||||
String xml = IOUtils
|
||||
.toString(
|
||||
OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
|
||||
|
||||
if (xml == null) {
|
||||
logger.info("Resource not found");
|
||||
}
|
||||
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
|
||||
if (p == null) {
|
||||
logger.info("XMLRecordParserNoDoi null");
|
||||
}
|
||||
WorkDataNoDoi workData = null;
|
||||
try {
|
||||
workData = p.VTDParseWorkData(xml.getBytes());
|
||||
} catch (Exception e) {
|
||||
logger.error("parsing xml", e);
|
||||
}
|
||||
assertNotNull(workData);
|
||||
|
||||
Contributor a = workData.getContributors().get(0);
|
||||
assertTrue(a.getCreditName().equals("Parkhouse, H."));
|
||||
|
||||
AuthorMatcher.match(author, workData.getContributors());
|
||||
|
||||
assertTrue(workData.getContributors().size() == 2);
|
||||
Contributor c = workData.getContributors().get(0);
|
||||
assertTrue(c.getOid().equals("0000-0002-5982-8983"));
|
||||
assertTrue(c.getName().equals("Parkhouse"));
|
||||
assertTrue(c.getSurname().equals("H."));
|
||||
assertTrue(c.getCreditName().equals("Parkhouse, H."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void match() {
|
||||
|
||||
AuthorData author = new AuthorData();
|
||||
author.setName("Joe");
|
||||
author.setSurname("Dodge");
|
||||
author.setOid("0000-1111-2222-3333");
|
||||
Contributor contributor = new Contributor();
|
||||
contributor.setCreditName("Joe Dodge");
|
||||
List<Contributor> contributors = Arrays.asList(contributor);
|
||||
AuthorMatcher am = new AuthorMatcher();
|
||||
int matchCounter = 0;
|
||||
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||
contributors
|
||||
.stream()
|
||||
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||
.forEach(c -> {
|
||||
if (am.simpleMatch(c.getCreditName(), author.getName()) ||
|
||||
am.simpleMatch(c.getCreditName(), author.getSurname()) ||
|
||||
am.simpleMatch(c.getCreditName(), author.getOtherName())) {
|
||||
matchCounters.set(0, matchCounters.get(0) + 1);
|
||||
c.setSimpleMatch(true);
|
||||
}
|
||||
});
|
||||
|
||||
assertTrue(matchCounters.get(0) == 1);
|
||||
am.updateAuthorsSimpleMatch(contributors, author);
|
||||
assertTrue(contributors.get(0).getName().equals("Joe"));
|
||||
assertTrue(contributors.get(0).getSurname().equals("Dodge"));
|
||||
assertTrue(contributors.get(0).getCreditName().equals("Joe Dodge"));
|
||||
assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));
|
||||
|
||||
AuthorData authorX = new AuthorData();
|
||||
authorX.setName(nameA);
|
||||
authorX.setSurname(surnameA);
|
||||
authorX.setOid(orcidIdA);
|
||||
Contributor contributorA = new Contributor();
|
||||
contributorA.setCreditName("Abdel-Dayem Khai");
|
||||
Contributor contributorB = new Contributor();
|
||||
contributorB.setCreditName("Abdel-Dayem Fake");
|
||||
List<Contributor> contributorList = new ArrayList<>();
|
||||
contributorList.add(contributorA);
|
||||
contributorList.add(contributorB);
|
||||
int matchCounter2 = 0;
|
||||
List<Integer> matchCounters2 = Arrays.asList(matchCounter2);
|
||||
contributorList
|
||||
.stream()
|
||||
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||
.forEach(c -> {
|
||||
if (am.simpleMatch(c.getCreditName(), authorX.getName()) ||
|
||||
am.simpleMatch(c.getCreditName(), authorX.getSurname()) ||
|
||||
am.simpleMatch(c.getCreditName(), authorX.getOtherName())) {
|
||||
int currentCounter = matchCounters2.get(0);
|
||||
currentCounter += 1;
|
||||
matchCounters2.set(0, currentCounter);
|
||||
c.setSimpleMatch(true);
|
||||
}
|
||||
});
|
||||
|
||||
assertTrue(matchCounters2.get(0) == 2);
|
||||
assertTrue(contributorList.get(0).isSimpleMatch());
|
||||
assertTrue(contributorList.get(1).isSimpleMatch());
|
||||
|
||||
Optional<Contributor> optCon = contributorList
|
||||
.stream()
|
||||
.filter(c -> c.isSimpleMatch())
|
||||
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||
.map(c -> {
|
||||
c.setScore(am.bestMatch(authorX.getName(), authorX.getSurname(), c.getCreditName()));
|
||||
return c;
|
||||
})
|
||||
.filter(c -> c.getScore() >= AuthorMatcher.threshold)
|
||||
.max(Comparator.comparing(c -> c.getScore()));
|
||||
assertTrue(optCon.isPresent());
|
||||
|
||||
final Contributor bestMatchContributor = optCon.get();
|
||||
bestMatchContributor.setBestMatch(true);
|
||||
assertTrue(bestMatchContributor.getCreditName().equals("Abdel-Dayem Khai"));
|
||||
assertTrue(contributorList.get(0).isBestMatch());
|
||||
assertTrue(!contributorList.get(1).isBestMatch());
|
||||
am.updateAuthorsSimilarityMatch(contributorList, authorX);
|
||||
assertTrue(contributorList.get(0).getName().equals(nameA));
|
||||
assertTrue(contributorList.get(0).getSurname().equals(surnameA));
|
||||
assertTrue(contributorList.get(0).getCreditName().equals("Abdel-Dayem Khai"));
|
||||
assertTrue(contributorList.get(0).getOid().equals(orcidIdA));
|
||||
assertTrue(StringUtils.isBlank(contributorList.get(1).getOid()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void authorBestMatchTest() throws Exception {
|
||||
String name = "Khairy";
|
||||
String surname = "Abdel Dayem";
|
||||
String orcidWork = "activity_work_0000-0003-2760-1191.xml";
|
||||
AuthorData author = new AuthorData();
|
||||
author.setName(name);
|
||||
author.setSurname(surname);
|
||||
author.setOid(orcidIdA);
|
||||
String xml = IOUtils
|
||||
.toString(
|
||||
OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
|
||||
|
||||
if (xml == null) {
|
||||
logger.info("Resource not found");
|
||||
}
|
||||
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
|
||||
if (p == null) {
|
||||
logger.info("XMLRecordParserNoDoi null");
|
||||
}
|
||||
WorkDataNoDoi workData = null;
|
||||
try {
|
||||
workData = p.VTDParseWorkData(xml.getBytes());
|
||||
} catch (Exception e) {
|
||||
logger.error("parsing xml", e);
|
||||
}
|
||||
AuthorMatcher.match(author, workData.getContributors());
|
||||
assertTrue(workData.getContributors().size() == 5);
|
||||
List<Contributor> c = workData.getContributors();
|
||||
assertTrue(c.get(0).getName().equals(name));
|
||||
assertTrue(c.get(0).getSurname().equals(surname));
|
||||
assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye"));
|
||||
assertTrue(c.get(0).getOid().equals(orcidIdA));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -68,7 +68,7 @@
|
|||
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
|
||||
<work:contributors>
|
||||
<work:contributor>
|
||||
<work:credit-name>Abdel-Dayem K</work:credit-name>
|
||||
<work:credit-name>Khair Abde Daye</work:credit-name>
|
||||
<work:contributor-attributes>
|
||||
<work:contributor-sequence>first</work:contributor-sequence>
|
||||
<work:contributor-role>author</work:contributor-role>
|
||||
|
|
Loading…
Reference in New Issue