orcid-no-doi #43

Merged
claudio.atzori merged 45 commits from enrico.ottonello/dnet-hadoop:orcid-no-doi into master 2020-12-02 10:55:12 +01:00
13 changed files with 1102 additions and 1 deletions
Showing only changes of commit fcbb4c1489 - Show all commits

View File

@ -84,6 +84,12 @@
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.8</version>

Versions of dependencies should be only declared in the main pom file. Please declare this dependency there (v1.8) and refer to it without overriding the version.

Versions of dependencies should be only declared in the main pom file. Please declare this dependency there (v1.8) and refer to it without overriding the version.

Please remove the dependency version at all from here. The version has to be declared only in the main pom file. Just like all the other dependencies in the same pom file.

Please remove the dependency version at all from here. The version has to be declared *only* in the main pom file. Just like all the other dependencies in the same pom file.

I still see the dependency version declared here. Please move in the project's main pom under the dependencyManagement section.

I still see the dependency version declared here. Please move in the project's main pom under the _dependencyManagement_ section.
</dependency>
</dependencies>

View File

@ -9,6 +9,7 @@ public class AuthorData implements Serializable {
private String name;
private String surname;
private String creditName;
private String otherName;
private String errorCode;
public String getErrorCode() {
@ -50,4 +51,12 @@ public class AuthorData implements Serializable {
public void setOid(String oid) {
this.oid = oid;
}
public String getOtherName() {
return otherName;
}
public void setOtherName(String otherName) {
this.otherName = otherName;
}
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.doiboost.orcidnodoi.model;
import java.io.Serializable;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
public class Contributor extends AuthorData implements Serializable {
private String sequence;
private String role;
private boolean simpleMatch = false;
private Double score = 0.0;
private boolean bestMatch = false;
public String getSequence() {
return sequence;
}
public void setSequence(String sequence) {
this.sequence = sequence;
}
public String getRole() {
return role;
}
public void setRole(String role) {
this.role = role;
}
public boolean isSimpleMatch() {
return simpleMatch;
}
public void setSimpleMatch(boolean simpleMatch) {
this.simpleMatch = simpleMatch;
}
public Double getScore() {
return score;
}
public void setScore(Double score) {
this.score = score;
}
public boolean isBestMatch() {
return bestMatch;
}
public void setBestMatch(boolean bestMatch) {
this.bestMatch = bestMatch;
}
}

View File

@ -0,0 +1,32 @@
package eu.dnetlib.doiboost.orcidnodoi.model;
public class ExternalId {
private String type;
private String value;
private String relationShip;
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
public String getRelationShip() {
return relationShip;
}
public void setRelationShip(String relationShip) {
this.relationShip = relationShip;
}
}

View File

@ -0,0 +1,32 @@
package eu.dnetlib.doiboost.orcidnodoi.model;
public class PublicationDate {
private String year;
private String month;
private String day;
public String getYear() {
return year;
}
public void setYear(String year) {
this.year = year;
}
public String getMonth() {
return month;
}
public void setMonth(String month) {
this.month = month;
}
public String getDay() {
return day;
}
public void setDay(String day) {
this.day = day;
}
}

View File

@ -0,0 +1,101 @@
package eu.dnetlib.doiboost.orcidnodoi.model;
import java.io.Serializable;
import java.util.List;
public class WorkDataNoDoi implements Serializable {
private String oid;
private String id;
private String sourceName;
private String type;
private List<String> titles;
private List<String> urls;
List<ExternalId> extIds;
List<PublicationDate> publicationDates;
List<Contributor> contributors;
public String getOid() {
return oid;
}
public void setOid(String oid) {
this.oid = oid;
}
public String getErrorCode() {
return errorCode;
}
public void setErrorCode(String errorCode) {
this.errorCode = errorCode;
}
private String errorCode;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<String> getTitles() {
return titles;
}
public void setTitles(List<String> titles) {
this.titles = titles;
}
public String getSourceName() {
return sourceName;
}
public void setSourceName(String sourceName) {
this.sourceName = sourceName;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public List<String> getUrls() {
return urls;
}
public void setUrls(List<String> urls) {
this.urls = urls;
}
public List<ExternalId> getExtIds() {
return extIds;
}
public void setExtIds(List<ExternalId> extIds) {
this.extIds = extIds;
}
public List<PublicationDate> getPublicationDates() {
return publicationDates;
}
public void setPublicationDates(List<PublicationDate> publicationDates) {
this.publicationDates = publicationDates;
}
public List<Contributor> getContributors() {
return contributors;
}
public void setContributors(List<Contributor> contributors) {
this.contributors = contributors;
}
}

View File

@ -0,0 +1,216 @@
package eu.dnetlib.doiboost.orcidnodoi.xml;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.ximpleware.*;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
public class XMLRecordParserNoDoi {
private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);
private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
private static final String NS_COMMON = "common";
private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
private static final String NS_PERSON = "person";
private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
private static final String NS_DETAILS = "personal-details";
private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
private static final String NS_OTHER = "other-name";
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
private static final String NS_RECORD = "record";
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
private static final String NS_WORK = "work";
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
private static final String NS_ERROR = "error";
public static WorkDataNoDoi VTDParseWorkData(byte[] bytes)
throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
NavException, XPathEvalException {
logger.info("parsing xml ...");
final VTDGen vg = new VTDGen();
vg.setDoc(bytes);
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
WorkDataNoDoi workData = new WorkDataNoDoi();
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
if (!errors.isEmpty()) {
workData.setErrorCode(errors.get(0));
return workData;
}
List<VtdUtilityParser.Node> workNodes = VtdUtilityParser
.getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path", "put-code"));
if (!workNodes.isEmpty()) {
final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1];
workData.setOid(oid);
final String id = (workNodes.get(0).getAttributes().get("put-code"));
workData.setId(id);
} else {
return null;
}
final List<String> titles = VtdUtilityParser
Review

Is the caller expecting the null? Otherwise this would likely produce a NPE.

Is the caller expecting the `null`? Otherwise this would likely produce a NPE.
Review

in this case null value indicate that the original publication in xml format was not parsed, because of mandatory information was not found; there is a check on this null value

in this case null value indicate that the original publication in xml format was not parsed, because of mandatory information was not found; there is a check on this null value
.getTextValue(
ap, vn, "//common:title");
if (!titles.isEmpty()) {
workData.setTitles(titles);
}
final List<String> sourceNames = VtdUtilityParser
.getTextValue(
ap, vn, "//common:source-name");
if (!sourceNames.isEmpty()) {
workData.setSourceName(sourceNames.get(0));
}
final List<String> types = VtdUtilityParser
.getTextValue(
ap, vn, "//work:type");
if (!types.isEmpty()) {
workData.setType(types.get(0));
}
final List<String> urls = VtdUtilityParser
.getTextValue(
ap, vn, "//common:url");
if (!urls.isEmpty()) {
workData.setUrls(urls);
}
workData.setPublicationDates(getPublicationDates(vg, vn, ap));
workData.setExtIds(getExternalIds(vg, vn, ap));
workData.setContributors(getContributors(vg, vn, ap));
return workData;
}
private static List<PublicationDate> getPublicationDates(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException {
List<PublicationDate> publicationDates = new ArrayList<PublicationDate>();
int yearIndex = 0;
ap.selectXPath("//common:publication-date/common:year");
while (ap.evalXPath() != -1) {
PublicationDate publicationDate = new PublicationDate();
int t = vn.getText();
if (t >= 0) {
publicationDate.setYear(vn.toNormalizedString(t));
publicationDates.add(yearIndex, publicationDate);
yearIndex++;
}
}
int monthIndex = 0;
ap.selectXPath("//common:publication-date/common:month");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
publicationDates.get(monthIndex).setMonth(vn.toNormalizedString(t));
monthIndex++;
}
}
int dayIndex = 0;
ap.selectXPath("//common:publication-date/common:day");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
publicationDates.get(dayIndex).setDay(vn.toNormalizedString(t));
dayIndex++;
}
}
return publicationDates;
}
private static List<ExternalId> getExternalIds(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException {
List<ExternalId> extIds = new ArrayList<ExternalId>();
int typeIndex = 0;
ap.selectXPath("//common:external-id/common:external-id-type");
while (ap.evalXPath() != -1) {
ExternalId extId = new ExternalId();
int t = vn.getText();
if (t >= 0) {
extId.setType(vn.toNormalizedString(t));
extIds.add(typeIndex, extId);
typeIndex++;
}
}
int valueIndex = 0;
ap.selectXPath("//common:external-id/common:external-id-value");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
extIds.get(valueIndex).setValue(vn.toNormalizedString(t));
valueIndex++;
}
}
int relationshipIndex = 0;
ap.selectXPath("//common:external-id/common:external-id-relationship");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
extIds.get(relationshipIndex).setRelationShip(vn.toNormalizedString(t));
relationshipIndex++;
}
}
if (typeIndex == valueIndex) {
return extIds;
}
return new ArrayList<ExternalId>();
}
private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException {
List<Contributor> contributors = new ArrayList<Contributor>();
int nameIndex = 0;
ap.selectXPath("//work:contributor/work:credit-name");
while (ap.evalXPath() != -1) {
Contributor contributor = new Contributor();
int t = vn.getText();
if (t >= 0) {
contributor.setCreditName(vn.toNormalizedString(t));
contributors.add(nameIndex, contributor);
nameIndex++;
}
}
int sequenceIndex = 0;
ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t));
sequenceIndex++;
}
}
int roleIndex = 0;
ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
contributors.get(roleIndex).setRole(vn.toNormalizedString(t));
roleIndex++;
}
}
return contributors;
}
}

View File

@ -44,7 +44,7 @@ public class XMLRecordParserTest {
String xml = IOUtils
.toString(
this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml"));
this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
XMLRecordParser p = new XMLRecordParser();

View File

@ -0,0 +1,326 @@
package eu.dnetlib.doiboost.orcidnodoi.xml;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import jdk.nashorn.internal.ir.annotations.Ignore;
import org.apache.commons.io.IOUtils;
import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.text.Normalizer;
import java.util.*;
import static org.junit.jupiter.api.Assertions.assertNotNull;
public class OrcidNoDoiTest {
private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
String nameA = "Khairy";
String surnameA = "Abdel Dayem";
String otherNameA = "Dayem MKA";

Why are the tests in this class marked as @Ignore? I see you spent some effort implementing them, I didn't dive much into the details, but I see assertions properly defined so if they are valuable I suggest to keep the test as active.

Why are the tests in this class marked as `@Ignore`? I see you spent some effort implementing them, I didn't dive much into the details, but I see assertions properly defined so if they are valuable I suggest to keep the test as active.
String nameB = "K";
String surnameB = "Abdel-Dayem";
String orcidIdA = "0000-0003-2760-1191";
Double threshold = 0.8;
@Test
@Ignore
private void similarityTest() throws Exception {
logger.info("running testSimilarity ....");
logger
.info(
"JaroWinklerSimilarity: "
+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
logger
.info(
"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
}
@Test
@Ignore
private void bestMatchTest() throws Exception {
logger.info("running bestMatchTest ....");
String contributor = surnameB + ", " + nameB;
logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
}
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
String[] contributorSplitted = contributor.split(" ");
if (contributorSplitted.length == 0) {
return 0.0;
}
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
String contributorSurname = "";
if (contributorSplitted.length > 1) {
StringJoiner joiner = new StringJoiner(" ");
for (int i = 0; i < contributorSplitted.length - 1; i++) {
joiner.add(contributorSplitted[i]);
}
contributorSurname = joiner.toString();
}
logger
.debug(
"contributorName: " + contributorName +
" contributorSurname: " + contributorSurname);
String authorNameNrm = normalize(authorName);
String authorSurnameNrm = normalize(authorSurname);
String contributorNameNrm = normalize(contributorName);
String contributorSurnameNrm = normalize(contributorSurname);
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
if (sm1.compareTo(sm2) >= 0) {
return sm1;
}
return sm2;
}
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
logger
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
return score;
}
private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
}
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
}
private static String parse(String name, String surname) {
return surname + " " + name;
}
private static String normalize(final String s) {
return nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError
// in case
// of large input strings
.replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
private static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
@Test
@Ignore
public void readPublicationFieldsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ....");
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
assertNotNull(workData);
assertNotNull(workData.getOid());
logger.info("oid: " + workData.getOid());
assertNotNull(workData.getTitles());
logger.info("titles: ");
workData.getTitles().forEach(t -> {
logger.info(t);
});
logger.info("source: " + workData.getSourceName());
logger.info("type: " + workData.getType());
logger.info("urls: ");
workData.getUrls().forEach(u -> {
logger.info(u);
});
logger.info("publication date: ");
workData.getPublicationDates().forEach(d -> {
logger.info(d.getYear() + " - " + d.getMonth() + " - " + d.getDay());
});
logger.info("external id: ");
workData.getExtIds().removeIf(e -> e.getRelationShip() != null && !e.getRelationShip().equals("self"));
workData.getExtIds().forEach(e -> {
logger.info(e.getType() + " - " + e.getValue() + " - " + e.getRelationShip());
});
logger.info("contributors: ");
workData.getContributors().forEach(c -> {
logger
.info(
c.getName() + " - " + c.getRole() + " - " + c.getSequence());
});
}
private void updateRanks(List<Contributor> contributors) {
boolean seqFound = false;
if (contributors
.stream()
.filter(
c -> c.getRole() != null && c.getSequence() != null &&
c.getRole().equals("author") && (c.getSequence().equals("first") ||
c.getSequence().equals("additional")))
.count() > 0) {
seqFound = true;
logger.info("sequence data found");
}
if (!seqFound) {
List<Integer> seqIds = Arrays.asList(0);
contributors.forEach(c -> {
int currentSeq = seqIds.get(0) + 1;
seqIds.set(0, currentSeq);
c.setSequence(Integer.toString(seqIds.get(0)));
});
}
}
private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
contributors.forEach(c -> {
if (c.isSimpleMatch()) {
logger.info("simple match on : " + c.getCreditName());
c.setName(author.getName());
c.setSurname(author.getSurname());
c.setOid(author.getOid());
}
});
updateRanks(contributors);
}
private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
logger.info("inside updateAuthorsSimilarityMatch ...");
contributors.forEach(c -> {
logger
.info(
c.getOid() + " - " + c.getCreditName() + " - " +
c.getName() + " - " + c.getSurname() + " - " +
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
+ c.isSimpleMatch());
});
contributors
.stream()
.filter(c -> c.isBestMatch())
.forEach(c -> {
logger.info("similarity match on : " + c.getCreditName());
c.setName(author.getName());
c.setSurname(author.getSurname());
c.setOid(author.getOid());
});
updateRanks(contributors);
}
@Test
@Ignore
public void authorSimilarityMatchTest() throws Exception {
logger.info("running authorSimilarityMatchTest ....");
authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
}
@Test
private void authorSimpleMatchTest() throws Exception {
logger.info("running authorSimpleMatchTest ....");
authorMatchTest("activity_work_0000-0003-2760-1191.xml");
}
private void authorMatchTest(String orcidWork)
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
AuthorData author = new AuthorData();
author.setName(nameA);
author.setSurname(surnameA);
author.setOid(orcidIdA);
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
assertNotNull(workData);
int matchCounter = 0;
List<Integer> matchCounters = Arrays.asList(matchCounter);
Contributor contributor = null;
workData.getContributors().forEach(c -> {
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
((author.getOtherName() != null)
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
matchCounters.set(0, matchCounters.get(0) + 1);
c.setSimpleMatch(true);
}
});
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
if (matchCounters.get(0) == 1) {
updateAuthorsSimpleMatch(workData.getContributors(), author);
} else if (matchCounters.get(0) > 1) {
Optional<Contributor> optCon = workData
.getContributors()
.stream()
.filter(c -> c.isSimpleMatch())
.map(c -> {
c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
return c;
})
.filter(c -> c.getScore() >= threshold)
.max(Comparator.comparing(c -> c.getScore()));
Contributor bestMatchContributor = null;
if (optCon.isPresent()) {
bestMatchContributor = optCon.get();
bestMatchContributor.setBestMatch(true);
logger.info("best match: " + bestMatchContributor.getCreditName());
updateAuthorsSimilarityMatch(workData.getContributors(), author);
}
}
logger.info("UPDATED contributors: ");
workData.getContributors().forEach(c -> {
logger
.info(
c.getOid() + " - " + c.getCreditName() + " - " +
c.getName() + " - " + c.getSurname() + " - " +
c.getRole() + " - " + c.getSequence());
});
}
}
//
// orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
//

View File

@ -0,0 +1,106 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
xmlns:employment="http://www.orcid.org/ns/employment"
xmlns:education="http://www.orcid.org/ns/education"
xmlns:other-name="http://www.orcid.org/ns/other-name"
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
xmlns:funding="http://www.orcid.org/ns/funding"
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
xmlns:service="http://www.orcid.org/ns/service"
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
xmlns:distinction="http://www.orcid.org/ns/distinction"
xmlns:internal="http://www.orcid.org/ns/internal"
xmlns:membership="http://www.orcid.org/ns/membership"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:activities="http://www.orcid.org/ns/activities"
xmlns:qualification="http://www.orcid.org/ns/qualification"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:error="http://www.orcid.org/ns/error"
xmlns:preferences="http://www.orcid.org/ns/preferences"
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
xmlns:work="http://www.orcid.org/ns/work"
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
path="/0000-0003-2760-1191/work/28776099" visibility="public">
<common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
<common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
<common:path>0000-0002-9157-3431</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Europe PubMed Central</common:source-name>
</common:source>
<work:title>
<common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
ST-Segment-Elevation Myocardial Infarction.</common:title>
</work:title>
<work:citation>
<work:citation-type>formatted-unspecified</work:citation-type>
<work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
</work:citation>
<work:type>journal-article</work:type>
<common:publication-date>
<common:year>2016</common:year>
<common:month>11</common:month>
</common:publication-date>
<common:external-ids>
<common:external-id>
<common:external-id-type>pmid</common:external-id-type>
<common:external-id-value>27899851</common:external-id-value>
<common:external-id-normalized transient="true">27899851</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>pmc</common:external-id-type>
<common:external-id-value>PMC5126442</common:external-id-value>
<common:external-id-normalized transient="true"
>PMC5126442</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
</common:external-ids>
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
<work:contributors>
<work:contributor>
<work:credit-name>Abdel-Dayem K</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Eweda II</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>El-Sherbiny A</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Dimitry MO</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Nammas W</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
</work:contributors>
</work:work>

View File

@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
xmlns:employment="http://www.orcid.org/ns/employment"
xmlns:education="http://www.orcid.org/ns/education"
xmlns:other-name="http://www.orcid.org/ns/other-name"
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
xmlns:funding="http://www.orcid.org/ns/funding"
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
xmlns:service="http://www.orcid.org/ns/service"
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
xmlns:distinction="http://www.orcid.org/ns/distinction"
xmlns:internal="http://www.orcid.org/ns/internal"
xmlns:membership="http://www.orcid.org/ns/membership"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:activities="http://www.orcid.org/ns/activities"
xmlns:qualification="http://www.orcid.org/ns/qualification"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:error="http://www.orcid.org/ns/error"
xmlns:preferences="http://www.orcid.org/ns/preferences"
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
xmlns:work="http://www.orcid.org/ns/work"
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
path="/0000-0003-2760-1191/work/28776099" visibility="public">
<common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
<common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
<common:path>0000-0002-9157-3431</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Europe PubMed Central</common:source-name>
</common:source>
<work:title>
<common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
ST-Segment-Elevation Myocardial Infarction.</common:title>
</work:title>
<work:citation>
<work:citation-type>formatted-unspecified</work:citation-type>
<work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
</work:citation>
<work:type>journal-article</work:type>
<common:publication-date>
<common:year>2016</common:year>
<common:month>11</common:month>
</common:publication-date>
<common:external-ids>
<common:external-id>
<common:external-id-type>pmid</common:external-id-type>
<common:external-id-value>27899851</common:external-id-value>
<common:external-id-normalized transient="true">27899851</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>pmc</common:external-id-type>
<common:external-id-value>PMC5126442</common:external-id-value>
<common:external-id-normalized transient="true"
>PMC5126442</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
</common:external-ids>
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
<work:contributors>
<work:contributor>
<work:credit-name>Abdel-Dayem K</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Abdel-Dayem Fake</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Eweda II</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>El-Sherbiny A</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Dimitry MO</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Nammas W</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
</work:contributors>
</work:work>

View File

@ -0,0 +1,106 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
xmlns:employment="http://www.orcid.org/ns/employment"
xmlns:education="http://www.orcid.org/ns/education"
xmlns:other-name="http://www.orcid.org/ns/other-name"
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
xmlns:funding="http://www.orcid.org/ns/funding"
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
xmlns:service="http://www.orcid.org/ns/service"
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
xmlns:distinction="http://www.orcid.org/ns/distinction"
xmlns:internal="http://www.orcid.org/ns/internal"
xmlns:membership="http://www.orcid.org/ns/membership"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:activities="http://www.orcid.org/ns/activities"
xmlns:qualification="http://www.orcid.org/ns/qualification"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:error="http://www.orcid.org/ns/error"
xmlns:preferences="http://www.orcid.org/ns/preferences"
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
xmlns:work="http://www.orcid.org/ns/work"
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
path="/0000-0003-2760-1191/work/28776099" visibility="public">
<common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
<common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
<common:path>0000-0002-9157-3431</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Europe PubMed Central</common:source-name>
</common:source>
<work:title>
<common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
ST-Segment-Elevation Myocardial Infarction.</common:title>
</work:title>
<work:citation>
<work:citation-type>formatted-unspecified</work:citation-type>
<work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
</work:citation>
<work:type>journal-article</work:type>
<common:publication-date>
<common:year>2016</common:year>
<common:month>11</common:month>
</common:publication-date>
<common:external-ids>
<common:external-id>
<common:external-id-type>pmid</common:external-id-type>
<common:external-id-value>27899851</common:external-id-value>
<common:external-id-normalized transient="true">27899851</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
<common:external-id>
<common:external-id-type>pmc</common:external-id-type>
<common:external-id-value>PMC5126442</common:external-id-value>
<common:external-id-normalized transient="true"
>PMC5126442</common:external-id-normalized>
<common:external-id-relationship>self</common:external-id-relationship>
</common:external-id>
</common:external-ids>
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
<work:contributors>
<work:contributor>
<work:credit-name>Abdel-Dayem K</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Eweda II</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>El-Sherbiny A</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Dimitry MO</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
<work:contributor>
<work:credit-name>Nammas W</work:credit-name>
<work:contributor-attributes>
<work:contributor-sequence>first</work:contributor-sequence>
<work:contributor-role>author</work:contributor-role>
</work:contributor-attributes>
</work:contributor>
</work:contributors>
</work:work>