From 1513174d7ec367222c063ba47095ee7ca4897e99 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 10 Nov 2020 11:44:55 +0100 Subject: [PATCH] added further test case --- .../SparkGenEnrichedOrcidWorks.java | 2 +- .../orcidnodoi/similarity/AuthorMatcher.java | 50 +++-- .../orcidnodoi/xml/OrcidNoDoiTest.java | 181 ++++++++++++++++-- .../xml/activity_work_0000-0003-2760-1191.xml | 2 +- 4 files changed, 202 insertions(+), 33 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 691ca3eee2..40cd212daa 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -96,7 +96,7 @@ public class SparkGenEnrichedOrcidWorks { Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .filter(Objects::nonNull) .toJavaRDD(); -// enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); + enrichedWorksRDD.saveAsTextFile(workingPath + "enrichedWorksText/"); logger.info("Enriched works RDD ready."); final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index 6a1468f4c8..2f86820fbe 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -33,7 +33,7 @@ import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; public class AuthorMatcher { private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class); - private static final Double threshold = 0.8; + public static final Double threshold = 0.8; public static void match(AuthorData author, List contributors) throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { @@ -41,16 +41,35 @@ public class AuthorMatcher { int matchCounter = 0; List matchCounters = Arrays.asList(matchCounter); Contributor contributor = null; - contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> { - if (simpleMatch(c.getCreditName(), author.getName()) || - simpleMatch(c.getCreditName(), author.getSurname()) || - simpleMatch(c.getCreditName(), author.getOtherName())) { - matchCounters.set(0, matchCounters.get(0) + 1); - c.setSimpleMatch(true); - } - }); + contributors + .stream() + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .forEach(c -> { + if (simpleMatch(c.getCreditName(), author.getName()) || + simpleMatch(c.getCreditName(), author.getSurname()) || + simpleMatch(c.getCreditName(), author.getOtherName())) { + matchCounters.set(0, matchCounters.get(0) + 1); + c.setSimpleMatch(true); + } + }); if (matchCounters.get(0) == 1) { updateAuthorsSimpleMatch(contributors, author); + } else if (matchCounters.get(0) == 0) { + Optional optCon = contributors + .stream() + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .map(c -> { + c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName())); + return c; + }) + .filter(c -> c.getScore() >= threshold) + .max(Comparator.comparing(c -> c.getScore())); + Contributor bestMatchContributor = null; + if (optCon.isPresent()) { + bestMatchContributor = optCon.get(); + bestMatchContributor.setBestMatch(true); + updateAuthorsSimilarityMatch(contributors, author); + } } else if (matchCounters.get(0) > 1) { Optional optCon = contributors .stream() @@ -68,19 +87,18 @@ public class AuthorMatcher { bestMatchContributor.setBestMatch(true); updateAuthorsSimilarityMatch(contributors, author); } - } } - private static boolean simpleMatch(String name, String searchValue) { + public static boolean simpleMatch(String name, String searchValue) { if (searchValue == null) { return false; } return normalize(name).contains(normalize(searchValue)); } - private static Double bestMatch(String authorSurname, String authorName, String contributor) { + public static Double bestMatch(String authorSurname, String authorName, String contributor) { String[] contributorSplitted = contributor.split(" "); if (contributorSplitted.length == 0) { return 0.0; @@ -106,7 +124,7 @@ public class AuthorMatcher { return sm2; } - private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { + public static Double similarity(String nameA, String surnameA, String nameB, String surnameB) { Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB); return score; } @@ -115,7 +133,7 @@ public class AuthorMatcher { return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB))); } - private static String normalize(final String s) { + public static String normalize(final String s) { if (s == null) { return new String(""); } @@ -140,7 +158,7 @@ public class AuthorMatcher { return surname + " " + name; } - private static void updateAuthorsSimpleMatch(List contributors, AuthorData author) { + public static void updateAuthorsSimpleMatch(List contributors, AuthorData author) { contributors.forEach(c -> { if (c.isSimpleMatch()) { c.setName(author.getName()); @@ -151,7 +169,7 @@ public class AuthorMatcher { updateRanks(contributors); } - private static void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { + public static void updateAuthorsSimilarityMatch(List contributors, AuthorData author) { contributors .stream() .filter(c -> c.isBestMatch()) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index fa2980ac4c..c2c4ed5e1f 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -38,12 +38,9 @@ public class OrcidNoDoiTest { private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class); - String nameA = "Khairy"; - String surnameA = "Abdel Dayem"; - String otherNameA = "Dayem MKA"; - String nameB = "K"; - String surnameB = "Abdel-Dayem"; - String orcidIdA = "0000-0003-2760-1191"; + static String nameA = "Khairy"; + static String surnameA = "Abdel Dayem"; + static String orcidIdA = "0000-0003-2760-1191"; @Test public void readPublicationFieldsTest() @@ -99,7 +96,7 @@ public class OrcidNoDoiTest { } @Test - public void authorMatchTest() throws Exception { + public void authorDoubleMatchTest() throws Exception { logger.info("running authorSimpleMatchTest ...."); String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml"; AuthorData author = new AuthorData(); @@ -129,16 +126,8 @@ public class OrcidNoDoiTest { assertTrue(a.getCreditName().equals("Abdel-Dayem K")); AuthorMatcher.match(author, workData.getContributors()); - GsonBuilder builder = new GsonBuilder(); - Gson gson = builder.create(); - logger.info(gson.toJson(workData)); assertTrue(workData.getContributors().size() == 6); - Contributor c = workData.getContributors().get(0); - assertTrue(c.getOid().equals("0000-0003-2760-1191")); - assertTrue(c.getName().equals("Khairy")); - assertTrue(c.getSurname().equals("Abdel Dayem")); - assertTrue(c.getCreditName().equals("Abdel-Dayem K")); } @Test @@ -180,4 +169,166 @@ public class OrcidNoDoiTest { assertTrue(workData.getContributors().get(4).getSequence().equals("seq4")); assertTrue(workData.getContributors().get(4).getRole().equals("role4")); } + + @Test + public void authorSimpleMatchTest() throws Exception { + String orcidWork = "activity_work_0000-0002-5982-8983.xml"; + AuthorData author = new AuthorData(); + author.setName("Parkhouse"); + author.setSurname("H."); + author.setOid("0000-0002-5982-8983"); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream(orcidWork)); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + assertNotNull(workData); + + Contributor a = workData.getContributors().get(0); + assertTrue(a.getCreditName().equals("Parkhouse, H.")); + + AuthorMatcher.match(author, workData.getContributors()); + + assertTrue(workData.getContributors().size() == 2); + Contributor c = workData.getContributors().get(0); + assertTrue(c.getOid().equals("0000-0002-5982-8983")); + assertTrue(c.getName().equals("Parkhouse")); + assertTrue(c.getSurname().equals("H.")); + assertTrue(c.getCreditName().equals("Parkhouse, H.")); + } + + @Test + public void match() { + + AuthorData author = new AuthorData(); + author.setName("Joe"); + author.setSurname("Dodge"); + author.setOid("0000-1111-2222-3333"); + Contributor contributor = new Contributor(); + contributor.setCreditName("Joe Dodge"); + List contributors = Arrays.asList(contributor); + AuthorMatcher am = new AuthorMatcher(); + int matchCounter = 0; + List matchCounters = Arrays.asList(matchCounter); + contributors + .stream() + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .forEach(c -> { + if (am.simpleMatch(c.getCreditName(), author.getName()) || + am.simpleMatch(c.getCreditName(), author.getSurname()) || + am.simpleMatch(c.getCreditName(), author.getOtherName())) { + matchCounters.set(0, matchCounters.get(0) + 1); + c.setSimpleMatch(true); + } + }); + + assertTrue(matchCounters.get(0) == 1); + am.updateAuthorsSimpleMatch(contributors, author); + assertTrue(contributors.get(0).getName().equals("Joe")); + assertTrue(contributors.get(0).getSurname().equals("Dodge")); + assertTrue(contributors.get(0).getCreditName().equals("Joe Dodge")); + assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333")); + + AuthorData authorX = new AuthorData(); + authorX.setName(nameA); + authorX.setSurname(surnameA); + authorX.setOid(orcidIdA); + Contributor contributorA = new Contributor(); + contributorA.setCreditName("Abdel-Dayem Khai"); + Contributor contributorB = new Contributor(); + contributorB.setCreditName("Abdel-Dayem Fake"); + List contributorList = new ArrayList<>(); + contributorList.add(contributorA); + contributorList.add(contributorB); + int matchCounter2 = 0; + List matchCounters2 = Arrays.asList(matchCounter2); + contributorList + .stream() + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .forEach(c -> { + if (am.simpleMatch(c.getCreditName(), authorX.getName()) || + am.simpleMatch(c.getCreditName(), authorX.getSurname()) || + am.simpleMatch(c.getCreditName(), authorX.getOtherName())) { + int currentCounter = matchCounters2.get(0); + currentCounter += 1; + matchCounters2.set(0, currentCounter); + c.setSimpleMatch(true); + } + }); + + assertTrue(matchCounters2.get(0) == 2); + assertTrue(contributorList.get(0).isSimpleMatch()); + assertTrue(contributorList.get(1).isSimpleMatch()); + + Optional optCon = contributorList + .stream() + .filter(c -> c.isSimpleMatch()) + .filter(c -> !StringUtils.isBlank(c.getCreditName())) + .map(c -> { + c.setScore(am.bestMatch(authorX.getName(), authorX.getSurname(), c.getCreditName())); + return c; + }) + .filter(c -> c.getScore() >= AuthorMatcher.threshold) + .max(Comparator.comparing(c -> c.getScore())); + assertTrue(optCon.isPresent()); + + final Contributor bestMatchContributor = optCon.get(); + bestMatchContributor.setBestMatch(true); + assertTrue(bestMatchContributor.getCreditName().equals("Abdel-Dayem Khai")); + assertTrue(contributorList.get(0).isBestMatch()); + assertTrue(!contributorList.get(1).isBestMatch()); + am.updateAuthorsSimilarityMatch(contributorList, authorX); + assertTrue(contributorList.get(0).getName().equals(nameA)); + assertTrue(contributorList.get(0).getSurname().equals(surnameA)); + assertTrue(contributorList.get(0).getCreditName().equals("Abdel-Dayem Khai")); + assertTrue(contributorList.get(0).getOid().equals(orcidIdA)); + assertTrue(StringUtils.isBlank(contributorList.get(1).getOid())); + } + + @Test + public void authorBestMatchTest() throws Exception { + String name = "Khairy"; + String surname = "Abdel Dayem"; + String orcidWork = "activity_work_0000-0003-2760-1191.xml"; + AuthorData author = new AuthorData(); + author.setName(name); + author.setSurname(surname); + author.setOid(orcidIdA); + String xml = IOUtils + .toString( + OrcidNoDoiTest.class.getResourceAsStream(orcidWork)); + + if (xml == null) { + logger.info("Resource not found"); + } + XMLRecordParserNoDoi p = new XMLRecordParserNoDoi(); + if (p == null) { + logger.info("XMLRecordParserNoDoi null"); + } + WorkDataNoDoi workData = null; + try { + workData = p.VTDParseWorkData(xml.getBytes()); + } catch (Exception e) { + logger.error("parsing xml", e); + } + AuthorMatcher.match(author, workData.getContributors()); + assertTrue(workData.getContributors().size() == 5); + List c = workData.getContributors(); + assertTrue(c.get(0).getName().equals(name)); + assertTrue(c.get(0).getSurname().equals(surname)); + assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye")); + assertTrue(c.get(0).getOid().equals(orcidIdA)); + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml index 485f4f8e8c..83752b1457 100644 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0003-2760-1191.xml @@ -68,7 +68,7 @@ http://europepmc.org/abstract/med/27899851 - Abdel-Dayem K + Khair Abde Daye first author