package eu.dnetlib.doiboost.orcidnodoi.xml;

import static org.junit.jupiter.api.Assertions.assertNotNull;

import java.io.IOException;
import java.text.Normalizer;
import java.util.*;

import org.apache.commons.io.IOUtils;
import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;

import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;

public class OrcidNoDoiTest {

	private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);

	String nameA = "Khairy";
	String surnameA = "Abdel Dayem";
	String otherNameA = "Dayem MKA";
	String nameB = "K";
	String surnameB = "Abdel-Dayem";
	String orcidIdA = "0000-0003-2760-1191";
	Double threshold = 0.8;

	@Test
	@Disabled
	public void similarityTest() throws Exception {
		logger.info("running similarityTest ....");
		logger
			.info(
				"JaroWinklerSimilarity: "
					+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
		logger
			.info(
				"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
	}

	@Test
	@Disabled
	public void bestMatchTest() throws Exception {
		logger.info("running bestMatchTest ....");
		String contributor = surnameB + ", " + nameB;
		logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
	}

	// Compares the author's name/surname against a free-form contributor string,
	// trying both name/surname orderings and returning the higher similarity score.
	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
		String[] contributorSplitted = contributor.split(" ");
		if (contributorSplitted.length == 0) {
			return 0.0;
		}
		final String contributorName = contributorSplitted[contributorSplitted.length - 1];
		String contributorSurname = "";
		if (contributorSplitted.length > 1) {
			StringJoiner joiner = new StringJoiner(" ");
			for (int i = 0; i < contributorSplitted.length - 1; i++) {
				joiner.add(contributorSplitted[i]);
			}
			contributorSurname = joiner.toString();
		}
		logger
			.debug(
				"contributorName: " + contributorName
					+ " contributorSurname: " + contributorSurname);
		String authorNameNrm = normalize(authorName);
		String authorSurnameNrm = normalize(authorSurname);
		String contributorNameNrm = normalize(contributorName);
		String contributorSurnameNrm = normalize(contributorSurname);
		Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
		Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
		if (sm1.compareTo(sm2) >= 0) {
			return sm1;
		}
		return sm2;
	}

	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
		logger
			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
		return score;
	}

	private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
		return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
	}

	private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB,
		String surnameB) {
		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
	}

	private static String parse(String name, String surname) {
		return surname + " " + name;
	}

	private static String normalize(final String s) {
		return nfd(s)
			.toLowerCase()
			// do not compact the regexes into a single expression: it would cause a
			// StackOverflowError on large input strings
			.replaceAll("(\\W)+", " ")
			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
			.replaceAll("(\\p{Punct})+", " ")
			.replaceAll("(\\d)+", " ")
			.replaceAll("(\\n)+", " ")
			.trim();
	}

	private static String nfd(final String s) {
		return Normalizer.normalize(s, Normalizer.Form.NFD);
	}

	@Test
	@Disabled
	public void readPublicationFieldsTest()
		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
		logger.info("running readPublicationFieldsTest ....");
		String xml = IOUtils
			.toString(
				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
		if (xml == null) {
			logger.info("Resource not found");
		}
		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
		if (p == null) {
			logger.info("XMLRecordParserNoDoi null");
		}
		WorkDataNoDoi workData = null;
		try {
			workData = p.VTDParseWorkData(xml.getBytes());
		} catch (Exception e) {
			logger.error("parsing xml", e);
		}
		assertNotNull(workData);
		assertNotNull(workData.getOid());
		logger.info("oid: " + workData.getOid());
		assertNotNull(workData.getTitles());
		logger.info("titles: ");
		workData.getTitles().forEach(t -> {
			logger.info(t);
		});
		logger.info("source: " + workData.getSourceName());
		logger.info("type: " + workData.getType());
		logger.info("urls: ");
		workData.getUrls().forEach(u -> {
			logger.info(u);
		});
		logger.info("publication date: ");
		workData.getPublicationDates().forEach(d -> {
			logger.info(d.getYear() + " - " + d.getMonth() + " - " + d.getDay());
		});
		logger.info("external id: ");
		workData
			.getExtIds()
			.removeIf(e -> e.getRelationShip() != null && !e.getRelationShip().equals("self"));
		workData.getExtIds().forEach(e -> {
			logger.info(e.getType() + " - " + e.getValue() + " - " + e.getRelationShip());
		});
		logger.info("contributors: ");
		workData.getContributors().forEach(c -> {
			logger
				.info(
					c.getName() + " - " + c.getRole() + " - " + c.getSequence());
		});
	}

	// Assigns incremental sequence numbers when no contributor carries ORCID sequence data.
	private void updateRanks(List<Contributor> contributors) {
		boolean seqFound = false;
		if (contributors
			.stream()
			.filter(
				c -> c.getRole() != null && c.getSequence() != null
					&& c.getRole().equals("author")
					&& (c.getSequence().equals("first") || c.getSequence().equals("additional")))
			.count() > 0) {
			seqFound = true;
			logger.info("sequence data found");
		}
		if (!seqFound) {
			List<Integer> seqIds = Arrays.asList(0);
			contributors.forEach(c -> {
				int currentSeq = seqIds.get(0) + 1;
				seqIds.set(0, currentSeq);
				c.setSequence(Integer.toString(seqIds.get(0)));
			});
		}
	}

	private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
		contributors.forEach(c -> {
			if (c.isSimpleMatch()) {
				logger.info("simple match on : " + c.getCreditName());
				c.setName(author.getName());
				c.setSurname(author.getSurname());
				c.setOid(author.getOid());
			}
		});
		updateRanks(contributors);
	}

	private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
		logger.info("inside updateAuthorsSimilarityMatch ...");
		contributors.forEach(c -> {
			logger
				.info(
					c.getOid() + " - " + c.getCreditName() + " - "
						+ c.getName() + " - " + c.getSurname() + " - "
						+ c.getRole() + " - " + c.getSequence()
						+ " - best: " + c.isBestMatch() + " - simple: " + c.isSimpleMatch());
		});
		contributors
			.stream()
			.filter(c -> c.isBestMatch())
			.forEach(c -> {
				logger.info("similarity match on : " + c.getCreditName());
				c.setName(author.getName());
				c.setSurname(author.getSurname());
				c.setOid(author.getOid());
			});
		updateRanks(contributors);
	}

	@Test
	@Disabled
	public void authorSimilarityMatchTest() throws Exception {
		logger.info("running authorSimilarityMatchTest ....");
		authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
	}

	@Test
	public void authorSimpleMatchTest() throws Exception {
		logger.info("running authorSimpleMatchTest ....");
		authorMatchTest("activity_work_0000-0003-2760-1191.xml");
	}

	// Parses an ORCID work XML fixture, flags contributors whose credit name contains the
	// author's name, surname or other name, and resolves multiple matches via bestMatch().
	private void authorMatchTest(String orcidWork)
		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
		AuthorData author = new AuthorData();
		author.setName(nameA);
		author.setSurname(surnameA);
		author.setOid(orcidIdA);
		String xml = IOUtils
			.toString(
				OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
		if (xml == null) {
			logger.info("Resource not found");
		}
		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
		if (p == null) {
			logger.info("XMLRecordParserNoDoi null");
		}
		WorkDataNoDoi workData = null;
		try {
			workData = p.VTDParseWorkData(xml.getBytes());
		} catch (Exception e) {
			logger.error("parsing xml", e);
		}
		assertNotNull(workData);
		int matchCounter = 0;
		List<Integer> matchCounters = Arrays.asList(matchCounter);
		Contributor contributor = null;
		workData.getContributors().forEach(c -> {
			if (normalize(c.getCreditName()).contains(normalize(author.getName()))
				|| normalize(c.getCreditName()).contains(normalize(author.getSurname()))
				|| ((author.getOtherName() != null)
					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
				matchCounters.set(0, matchCounters.get(0) + 1);
				c.setSimpleMatch(true);
			}
		});
		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
		if (matchCounters.get(0) == 1) {
			updateAuthorsSimpleMatch(workData.getContributors(), author);
		} else if (matchCounters.get(0) > 1) {
			Optional<Contributor> optCon = workData
				.getContributors()
				.stream()
				.filter(c -> c.isSimpleMatch())
				.map(c -> {
					c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
					logger.debug("in map: " + c.getCreditName() + " score: " + c.getScore());
					return c;
				})
				.filter(c -> c.getScore() >= threshold)
				.max(Comparator.comparing(c -> c.getScore()));
			Contributor bestMatchContributor = null;
			if (optCon.isPresent()) {
				bestMatchContributor = optCon.get();
				bestMatchContributor.setBestMatch(true);
				logger.info("best match: " + bestMatchContributor.getCreditName());
				updateAuthorsSimilarityMatch(workData.getContributors(), author);
			}
		}
		logger.info("UPDATED contributors: ");
		workData.getContributors().forEach(c -> {
			logger
				.info(
					c.getOid() + " - " + c.getCreditName() + " - "
						+ c.getName() + " - " + c.getSurname() + " - "
						+ c.getRole() + " - " + c.getSequence());
		});
	}
}

// downstream pipeline note (PySpark):
// orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x: x is not None).map(lambda x: json.dumps(x))
//     .saveAsTextFile(path=ORCID_OPENAIRE_PATH, compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")