From 6ce36b3e412020619afd8a242d863ec3a4f632b5 Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Tue, 14 Nov 2023 12:04:29 +0100
Subject: [PATCH] Implemented ORCID workflow in dhp-aggregation for retrieving
 the ORCID dump and generating tables

---
 .../orcid/DownloadORCIDDumpApplication.java   | 102 +++
 .../collection/orcid/ExtractORCIDDump.java    |  71 +++
 .../dhp/collection/orcid/ORCIDExtractor.java  | 171 ++++
 .../dhp/collection/orcid/OrcidParser.java     | 251 ++++
 .../dhp/collection/orcid/model/Author.java    |  83 +++
 .../collection/orcid/model/Employment.java    |  54 ++
 .../dhp/collection/orcid/model/ORCIDItem.java |  14 +
 .../dhp/collection/orcid/model/Pid.java       |  33 +
 .../dhp/collection/orcid/model/Work.java      |  35 ++
 .../orcid/download_orcid_parameter.json       |  21 +
 .../orcid/extract_orcid_parameter.json        |  21 +
 .../orcid/generate_orcid_table_parameter.json |  21 +
 .../orcid/oozie_app/config-default.xml        |  23 +
 .../collection/orcid/oozie_app/workflow.xml   |  81 +++
 .../preprocess_orcid_dump_parameter.json      |  21 +
 .../orcid/SparkGenerateORCIDTable.scala       | 101 +++
 .../collection/orcid/DownloadORCIDTest.java   | 158 +++
 .../activity_work_0000-0002-2536-4498.xml     |  69 +++
 .../activity_work_0000-0002-5982-8983.xml     |  79 +++
 ...ty_work_0000-0003-2760-1191-similarity.xml | 113 ++++
 .../activity_work_0000-0003-2760-1191.xml     | 106 ++++
 ..._work_0000-0003-2760-1191_contributors.xml | 101 +++
 .../dhp/collection/orcid/employment.xml       |  50 ++
 .../dhp/collection/orcid/employment_2.xml     |  55 ++
 .../dhp/collection/orcid/employment_3.xml     |  62 ++
 .../dnetlib/dhp/collection/orcid/summary.xml  | 581 ++++++++++++++++++
 26 files changed, 2477 insertions(+)
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/DownloadORCIDDumpApplication.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ExtractORCIDDump.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/OrcidParser.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Author.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Employment.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/ORCIDItem.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Pid.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Work.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/download_orcid_parameter.json
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/extract_orcid_parameter.json
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/generate_orcid_table_parameter.json
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/preprocess_orcid_dump_parameter.json
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/orcid/SparkGenerateORCIDTable.scala
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/orcid/DownloadORCIDTest.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-2536-4498.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-5982-8983.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191-similarity.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191_contributors.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment_2.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment_3.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/summary.xml
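The workflow chains three steps: download the dump files listed by the Figshare API, extract the tar.gz archives into Hadoop SequenceFiles, and generate Spark tables from them. A minimal sketch of driving the first step directly, assuming only this module's classpath; every path and URL below is an illustrative placeholder, not part of the patch:

    // Hypothetical driver for the download step; all argument values are placeholders.
    public class DownloadSketch {
        public static void main(String[] args) throws Exception {
            DownloadORCIDDumpApplication.main(new String[] {
                "--namenode", "hdfs://nameservice1",
                "--targetPath", "/data/orcid/dump",
                "--apiURL", "https://api.figshare.com/v2/articles/<orcid-dump-id>"
            });
        }
    }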
log.info("targetPath is {}", targetPath); + + final String apiURL = argumentParser.get("apiURL"); + log.info("apiURL is {}", apiURL); + + final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsuri)); + + new DownloadORCIDDumpApplication(fileSystem).run(targetPath, apiURL); + + } + + private void downloadItem(final String name, final String itemURL, final String basePath) { + try { + final Path hdfsWritePath = new Path(String.format("%s/%s", basePath, name)); + final FSDataOutputStream fsDataOutputStream = fileSystem.create(hdfsWritePath, true); + final HttpGet request = new HttpGet(itemURL); + final int timeout = 60; // seconds + final RequestConfig config = RequestConfig + .custom() + .setConnectTimeout(timeout * 1000) + .setConnectionRequestTimeout(timeout * 1000) + .setSocketTimeout(timeout * 1000) + .build(); + log.info("Downloading url {} into {}", itemURL, hdfsWritePath.getName()); + try (CloseableHttpClient client = HttpClientBuilder.create().setDefaultRequestConfig(config).build(); + CloseableHttpResponse response = client.execute(request)) { + int responseCode = response.getStatusLine().getStatusCode(); + log.info("Response code is {}", responseCode); + if (responseCode >= 200 && responseCode < 400) { + IOUtils.copy(response.getEntity().getContent(), fsDataOutputStream); + } + } catch (Throwable eu) { + throw new RuntimeException(eu); + } + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + + protected void run(final String targetPath, final String apiURL) throws Exception { + final ObjectMapper mapper = new ObjectMapper(); + final URL url = new URL(apiURL); + URLConnection conn = url.openConnection(); + InputStream is = conn.getInputStream(); + final String json = IOUtils.toString(is); + JsonNode jsonNode = mapper.readTree(json); + jsonNode + .get("files") + .forEach(i -> downloadItem(i.get("name").asText(), i.get("download_url").asText(), targetPath)); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ExtractORCIDDump.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ExtractORCIDDump.java new file mode 100644 index 000000000..4aefdb5e9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ExtractORCIDDump.java @@ -0,0 +1,71 @@ + +package eu.dnetlib.dhp.collection.orcid; + +import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class ExtractORCIDDump { + private static final Logger log = LoggerFactory.getLogger(ExtractORCIDDump.class); + + private final FileSystem fileSystem; + + public ExtractORCIDDump(FileSystem fileSystem) { + this.fileSystem = fileSystem; + } + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( + IOUtils + .toString( + Objects + .requireNonNull( + DownloadORCIDDumpApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/orcid/extract_orcid_parameter.json")))); + argumentParser.parseArgument(args); + + final String hdfsuri = 
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ExtractORCIDDump.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ExtractORCIDDump.java
new file mode 100644
index 000000000..4aefdb5e9
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ExtractORCIDDump.java
@@ -0,0 +1,71 @@
+
+package eu.dnetlib.dhp.collection.orcid;
+
+import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+public class ExtractORCIDDump {
+
+    private static final Logger log = LoggerFactory.getLogger(ExtractORCIDDump.class);
+
+    private final FileSystem fileSystem;
+
+    public ExtractORCIDDump(FileSystem fileSystem) {
+        this.fileSystem = fileSystem;
+    }
+
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
+            IOUtils
+                .toString(
+                    Objects
+                        .requireNonNull(
+                            ExtractORCIDDump.class
+                                .getResourceAsStream("/eu/dnetlib/dhp/collection/orcid/extract_orcid_parameter.json"))));
+        argumentParser.parseArgument(args);
+
+        final String hdfsuri = argumentParser.get("namenode");
+        log.info("hdfsURI is {}", hdfsuri);
+
+        final String sourcePath = argumentParser.get("sourcePath");
+        log.info("sourcePath is {}", sourcePath);
+
+        final String targetPath = argumentParser.get("targetPath");
+        log.info("targetPath is {}", targetPath);
+
+        final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsuri));
+
+        new ExtractORCIDDump(fileSystem).run(sourcePath, targetPath);
+    }
+
+    public void run(final String sourcePath, final String targetPath) throws IOException, InterruptedException {
+        final RemoteIterator<LocatedFileStatus> ls = fileSystem.listFiles(new Path(sourcePath), false);
+        final List<ORCIDExtractor> workers = new ArrayList<>();
+        int i = 0;
+        while (ls.hasNext()) {
+            final LocatedFileStatus current = ls.next();
+            if (current.getPath().getName().endsWith("tar.gz")) {
+                workers.add(new ORCIDExtractor(fileSystem, "" + i++, current.getPath(), targetPath));
+            }
+        }
+        workers.forEach(Thread::start);
+        for (ORCIDExtractor worker : workers) {
+            worker.join();
+        }
+    }
+}
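ExtractORCIDDump.run() starts one ORCIDExtractor thread per tar.gz archive, numbering the workers 0, 1, 2, ... in listing order. Assuming one summaries archive and two activities archives, the target directory would end up roughly like this (an illustrative listing, not output of the patch):

    <targetPath>/summaries_0
    <targetPath>/works_1
    <targetPath>/employments_1
    <targetPath>/works_2
    <targetPath>/employments_2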
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java
new file mode 100644
index 000000000..11f4c55d8
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java
@@ -0,0 +1,171 @@
+
+package eu.dnetlib.dhp.collection.orcid;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * The ORCIDExtractor class extracts ORCID data from a TAR archive.
+ * The class creates a map of SequenceFile.Writer objects, one for each type of data to be extracted
+ * (e.g. employments, works, summaries). It then iterates over the TAR archive and appends each entry
+ * to the appropriate SequenceFile.Writer. Finally, it closes all the SequenceFile.Writer objects.
+ */
+public class ORCIDExtractor extends Thread {
+
+    private static final Logger log = LoggerFactory.getLogger(ORCIDExtractor.class);
+
+    private final FileSystem fileSystem;
+
+    private final String id;
+
+    private final Path sourcePath;
+
+    private final String baseOutputPath;
+
+    public ORCIDExtractor(FileSystem fileSystem, String id, Path sourcePath, String baseOutputPath) {
+        this.fileSystem = fileSystem;
+        this.id = id;
+        this.sourcePath = sourcePath;
+        this.baseOutputPath = baseOutputPath;
+    }
+
+    /**
+     * Creates a map of SequenceFile.Writer objects, one for each type of data to be extracted.
+     * The map is built from the name of the TAR archive being processed: an archive whose name
+     * contains "summaries" gets a single summary writer, any other archive gets one writer for
+     * employments and one for works.
+     *
+     * @return the Map
+     */
+    private Map<String, SequenceFile.Writer> createMap() {
+        try {
+            log.info("Thread {} Creating sequence files starting from this input Path {}", id, sourcePath.getName());
+            final Map<String, SequenceFile.Writer> res = new HashMap<>();
+            if (sourcePath.getName().contains("summaries")) {
+
+                final String summaryPath = String.format("%s/summaries_%s", baseOutputPath, id);
+                final SequenceFile.Writer summary_file = SequenceFile
+                    .createWriter(
+                        fileSystem.getConf(),
+                        SequenceFile.Writer.file(new Path(summaryPath)),
+                        SequenceFile.Writer.keyClass(Text.class),
+                        SequenceFile.Writer.valueClass(Text.class));
+
+                log.info("Thread {} Creating only summary path here {}", id, summaryPath);
+                res.put("summary", summary_file);
+                return res;
+            } else {
+                final String employmentsPath = String.format("%s/employments_%s", baseOutputPath, id);
+                final SequenceFile.Writer employments_file = SequenceFile
+                    .createWriter(
+                        fileSystem.getConf(),
+                        SequenceFile.Writer.file(new Path(employmentsPath)),
+                        SequenceFile.Writer.keyClass(Text.class),
+                        SequenceFile.Writer.valueClass(Text.class));
+                res.put("employments", employments_file);
+                log.info("Thread {} Creating employments path here {}", id, employmentsPath);
+
+                final String worksPath = String.format("%s/works_%s", baseOutputPath, id);
+                final SequenceFile.Writer works_file = SequenceFile
+                    .createWriter(
+                        fileSystem.getConf(),
+                        SequenceFile.Writer.file(new Path(worksPath)),
+                        SequenceFile.Writer.keyClass(Text.class),
+                        SequenceFile.Writer.valueClass(Text.class));
+                res.put("works", works_file);
+                log.info("Thread {} Creating works path here {}", id, worksPath);
+
+                return res;
+            }
+        } catch (Throwable e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    public void run() {
+
+        final CompressionCodecFactory factory = new CompressionCodecFactory(fileSystem.getConf());
+        final CompressionCodec codec = factory.getCodec(sourcePath);
+        if (codec == null) {
+            log.error("No codec found for {}", sourcePath.getName());
+            System.exit(1);
+        }
+
+        InputStream gzipInputStream = null;
+        try {
+            gzipInputStream = codec.createInputStream(fileSystem.open(sourcePath));
+            final Map<String, SequenceFile.Writer> fileMap = createMap();
+            iterateTar(fileMap, gzipInputStream);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        } finally {
+            log.info("Closing gzip stream");
+            IOUtils.closeStream(gzipInputStream);
+        }
+    }
+
+    private SequenceFile.Writer retrieveFile(Map<String, SequenceFile.Writer> fileMap, final String path) {
+        if (sourcePath.getName().contains("summaries")) {
+            return fileMap.get("summary");
+        }
+        if (path.contains("works")) {
+            return fileMap.get("works");
+        }
+        if (path.contains("employments"))
+            return fileMap.get("employments");
+        return null;
+    }
+
+    private void iterateTar(Map<String, SequenceFile.Writer> fileMap, InputStream gzipInputStream) throws IOException {
+
+        int extractedItem = 0;
+        try (final TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+
+            TarArchiveEntry entry;
+            while ((entry = tais.getNextTarEntry()) != null) {
+
+                if (entry.isFile()) {
+
+                    final SequenceFile.Writer fl = retrieveFile(fileMap, entry.getName());
+                    if (fl != null) {
+                        final Text key = new Text(entry.getName());
+                        final Text value = new Text(
+                            org.apache.commons.io.IOUtils.toString(new BufferedReader(new InputStreamReader(tais))));
+                        fl.append(key, value);
+                        extractedItem++;
+                        if (extractedItem % 100000 == 0) {
+                            log.info("Thread {}: Extracted {} items", id, extractedItem);
+                        }
+                    }
+                }
+            }
+        } finally {
+            log.info("Thread {}: Completed processing {} items", id, extractedItem);
+            for (SequenceFile.Writer k : fileMap.values()) {
+                k.hflush();
+                k.close();
+            }
+        }
+    }
+}
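Each writer produces Text/Text pairs: the TAR entry path as key and the raw record XML as value. A minimal read-back sketch, assuming Hadoop on the classpath and an illustrative file path:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class ReadExtractedSketch {
        public static void main(String[] args) throws Exception {
            final Configuration conf = new Configuration();
            // Path is a placeholder for one of the files produced by ORCIDExtractor.
            try (SequenceFile.Reader reader = new SequenceFile.Reader(
                conf, SequenceFile.Reader.file(new Path("/data/orcid/extracted/works_0")))) {
                final Text key = new Text();
                final Text value = new Text();
                while (reader.next(key, value)) {
                    System.out.println(key); // e.g. 123/0000-0002-1234-5678/works/....xml
                }
            }
        }
    }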
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/OrcidParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/OrcidParser.java
new file mode 100644
index 000000000..159b8a5fc
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/OrcidParser.java
@@ -0,0 +1,251 @@
+
+package eu.dnetlib.dhp.collection.orcid;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ximpleware.*;
+
+import eu.dnetlib.dhp.collection.orcid.model.*;
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
+
+public class OrcidParser {
+
+    final Logger log = LoggerFactory.getLogger(OrcidParser.class);
+    private VTDNav vn;
+
+    private AutoPilot ap;
+    private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
+    private static final String NS_COMMON = "common";
+    private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
+    private static final String NS_PERSON = "person";
+    private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
+    private static final String NS_DETAILS = "personal-details";
+    private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
+    private static final String NS_OTHER = "other-name";
+    private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
+    private static final String NS_RECORD = "record";
+    private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
+    private static final String NS_ACTIVITIES = "activities";
+    private static final String NS_ACTIVITIES_URL = "http://www.orcid.org/ns/activities";
+    private static final String NS_WORK = "work";
+    private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
+
+    private static final String NS_ERROR = "error";
+    private static final String NS_HISTORY = "history";
+    private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history";
+    private static final String NS_BULK_URL = "http://www.orcid.org/ns/bulk";
+    private static final String NS_BULK = "bulk";
+    private static final String NS_EXTERNAL = "external-identifier";
+    private static final String NS_EXTERNAL_URL = "http://www.orcid.org/ns/external-identifier";
+
+    private void generateParsedDocument(final String xml) throws ParseException {
+        final VTDGen vg = new VTDGen();
+        vg.setDoc(xml.getBytes());
+        vg.parse(true);
+        this.vn = vg.getNav();
+        this.ap = new AutoPilot(vn);
+        ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
+        ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL);
+        ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL);
+        ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
+        ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
+        ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
+        ap.declareXPathNameSpace(NS_HISTORY, NS_HISTORY_URL);
+        ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
+        ap.declareXPathNameSpace(NS_EXTERNAL, NS_EXTERNAL_URL);
+        ap.declareXPathNameSpace(NS_ACTIVITIES, NS_ACTIVITIES_URL);
+    }
+
+    public Author parseSummary(final String xml) {
+
+        try {
+            final Author author = new Author();
+            generateParsedDocument(xml);
+            final List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
+                .getTextValuesWithAttributes(ap, vn, "//record:record", Arrays.asList("path"));
+            if (!recordNodes.isEmpty()) {
+                final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1);
+                author.setOrcid(oid);
+            } else {
+                return null;
+            }
+            final List<VtdUtilityParser.Node> personNodes = VtdUtilityParser
+                .getTextValuesWithAttributes(ap, vn, "//person:name", Arrays.asList("visibility"));
+            final String visibility = (personNodes.get(0).getAttributes().get("visibility"));
+            author.setVisibility(visibility);
+            final String name = VtdUtilityParser.getSingleValue(ap, vn, "//personal-details:given-names");
+            author.setGivenName(name);
+
+            final String surnames = VtdUtilityParser.getSingleValue(ap, vn, "//personal-details:family-name");
+            author.setFamilyName(surnames);
+
+            final String creditNames = VtdUtilityParser.getSingleValue(ap, vn, "//personal-details:credit-name");
+            author.setCreditName(creditNames);
+
+            final String biography = VtdUtilityParser
+                .getSingleValue(ap, vn, "//person:biography/personal-details:content");
+            author.setBiography(biography);
+
+            final List<String> otherNames = VtdUtilityParser.getTextValue(ap, vn, "//other-name:content");
+            if (!otherNames.isEmpty()) {
+                author.setOtherNames(otherNames);
+            }
+
+            ap.selectXPath("//external-identifier:external-identifier");
+
+            while (ap.evalXPath() != -1) {
+                final Pid pid = new Pid();
+
+                final AutoPilot ap1 = new AutoPilot(ap.getNav());
+
+                ap1.selectXPath("./common:external-id-type");
+                while (ap1.evalXPath() != -1) {
+                    int it = vn.getText();
+                    pid.setSchema(vn.toNormalizedString(it));
+                }
+                ap1.selectXPath("./common:external-id-value");
+                while (ap1.evalXPath() != -1) {
+                    int it = vn.getText();
+                    pid.setValue(vn.toNormalizedString(it));
+                }
+
+                author.addOtherPid(pid);
+            }
+
+            return author;
+        } catch (Throwable e) {
+            log.error("Error on parsing {}", xml);
+            log.error(e.getMessage());
+            return null;
+        }
+    }
+
+    public Work parseWork(final String xml) {
+
+        try {
+            final Work work = new Work();
+            generateParsedDocument(xml);
+            final List<VtdUtilityParser.Node> workNodes = VtdUtilityParser
+                .getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path", "visibility"));
+            if (!workNodes.isEmpty()) {
+                final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1];
+                work.setOrcid(oid);
+            } else {
+                return null;
+            }
+
+            ap.selectXPath("//common:external-id");
+
+            while (ap.evalXPath() != -1) {
+                final Pid pid = new Pid();
+
+                final AutoPilot ap1 = new AutoPilot(ap.getNav());
+
+                ap1.selectXPath("./common:external-id-type");
+                while (ap1.evalXPath() != -1) {
+                    int it = vn.getText();
+                    pid.setSchema(vn.toNormalizedString(it));
+                }
+                ap1.selectXPath("./common:external-id-value");
+                while (ap1.evalXPath() != -1) {
+                    int it = vn.getText();
+                    pid.setValue(vn.toNormalizedString(it));
+                }
+
+                work.addPid(pid);
+            }
+
+            work.setTitle(VtdUtilityParser.getSingleValue(ap, vn, "//work:title/common:title"));
+
+            return work;
+        } catch (Throwable e) {
+            log.error("Error on parsing {}", xml);
+            log.error(e.getMessage());
+            return null;
+        }
+    }
+
+    private String extractEmploymentDate(final String xpath) throws Exception {
+
+        ap.selectXPath(xpath);
+        final StringBuilder sb = new StringBuilder();
+        while (ap.evalXPath() != -1) {
+            final AutoPilot ap1 = new AutoPilot(ap.getNav());
+            ap1.selectXPath("./common:year");
+            while (ap1.evalXPath() != -1) {
+                int it = vn.getText();
+                sb.append(vn.toNormalizedString(it));
+            }
+            ap1.selectXPath("./common:month");
+            while (ap1.evalXPath() != -1) {
+                int it = vn.getText();
+                sb.append("-");
+                sb.append(vn.toNormalizedString(it));
+            }
+            ap1.selectXPath("./common:day");
+            while (ap1.evalXPath() != -1) {
+                int it = vn.getText();
+                sb.append("-");
+                sb.append(vn.toNormalizedString(it));
+            }
+        }
+        return sb.toString();
+    }
+
+    public Employment parseEmployment(final String xml) {
+        try {
+            final Employment employment = new Employment();
+            generateParsedDocument(xml);
+            final String oid = VtdUtilityParser
+                .getSingleValue(ap, vn, "//common:source-orcid/common:path");
+            if (StringUtils.isNotBlank(oid)) {
+                employment.setOrcid(oid);
+            } else {
+                return null;
+            }
+            final String depName = VtdUtilityParser
+                .getSingleValue(ap, vn, "//common:department-name");
+            final String rolTitle = VtdUtilityParser
+                .getSingleValue(ap, vn, "//common:role-title");
+            if (StringUtils.isNotBlank(rolTitle))
+                employment.setRoleTitle(rolTitle);
+            if (StringUtils.isNotBlank(depName))
+                employment.setDepartmentName(depName);
+            else
+                employment
+                    .setDepartmentName(
+                        VtdUtilityParser.getSingleValue(ap, vn, "//common:organization/common:name"));
+
+            employment.setStartDate(extractEmploymentDate("//common:start-date"));
+            employment.setEndDate(extractEmploymentDate("//common:end-date"));
+
+            final String affiliationId = VtdUtilityParser
+                .getSingleValue(ap, vn, "//common:disambiguated-organization-identifier");
+            final String affiliationIdType = VtdUtilityParser
+                .getSingleValue(ap, vn, "//common:disambiguation-source");
+
+            if (StringUtils.isNotBlank(affiliationId) || StringUtils.isNotBlank(affiliationIdType))
+                employment.setAffiliationId(new Pid(affiliationId, affiliationIdType));
+
+            return employment;
+        } catch (Throwable e) {
+            log.error("Error on parsing {}", xml);
+            log.error(e.getMessage());
+            return null;
+        }
+    }
+}
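A minimal usage sketch for the parser, assuming xml holds one record read from a summaries SequenceFile; like parseWork and parseEmployment, parseSummary returns null for records it cannot handle, so callers are expected to filter nulls. extractEmploymentDate concatenates year, month and day into strings of the form "2014-10-20".

    final OrcidParser parser = new OrcidParser();
    final Author author = parser.parseSummary(xml); // xml is an assumption here
    if (author != null) {
        System.out.println(author.getOrcid() + ": " + author.getFamilyName());
    }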
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Author.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Author.java
new file mode 100644
index 000000000..32c321b41
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Author.java
@@ -0,0 +1,83 @@
+
+package eu.dnetlib.dhp.collection.orcid.model;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class Author extends ORCIDItem {
+
+    private String givenName;
+    private String familyName;
+
+    private String visibility;
+
+    private String creditName;
+
+    private List<String> otherNames;
+
+    private List<Pid> otherPids;
+
+    private String biography;
+
+    public String getBiography() {
+        return biography;
+    }
+
+    public void setBiography(String biography) {
+        this.biography = biography;
+    }
+
+    public String getGivenName() {
+        return givenName;
+    }
+
+    public void setGivenName(String givenName) {
+        this.givenName = givenName;
+    }
+
+    public String getFamilyName() {
+        return familyName;
+    }
+
+    public void setFamilyName(String familyName) {
+        this.familyName = familyName;
+    }
+
+    public String getCreditName() {
+        return creditName;
+    }
+
+    public void setCreditName(String creditName) {
+        this.creditName = creditName;
+    }
+
+    public List<String> getOtherNames() {
+        return otherNames;
+    }
+
+    public void setOtherNames(List<String> otherNames) {
+        this.otherNames = otherNames;
+    }
+
+    public String getVisibility() {
+        return visibility;
+    }
+
+    public void setVisibility(String visibility) {
+        this.visibility = visibility;
+    }
+
+    public List<Pid> getOtherPids() {
+        return otherPids;
+    }
+
+    public void setOtherPids(List<Pid> otherPids) {
+        this.otherPids = otherPids;
+    }
+
+    public void addOtherPid(final Pid pid) {
+        if (otherPids == null)
+            otherPids = new ArrayList<>();
+        otherPids.add(pid);
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Employment.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Employment.java
new file mode 100644
index 000000000..baee67d46
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Employment.java
@@ -0,0 +1,54 @@
+
+package eu.dnetlib.dhp.collection.orcid.model;
+
+public class Employment extends ORCIDItem {
+
+    private String startDate;
+    private String endDate;
+
+    private Pid affiliationId;
+
+    private String departmentName;
+
+    private String roleTitle;
+
+    public String getStartDate() {
+        return startDate;
+    }
+
+    public void setStartDate(String startDate) {
+        this.startDate = startDate;
+    }
+
+    public String getEndDate() {
+        return endDate;
+    }
+
+    public void setEndDate(String endDate) {
+        this.endDate = endDate;
+    }
+
+    public Pid getAffiliationId() {
+        return affiliationId;
+    }
+
+    public void setAffiliationId(Pid affiliationId) {
+        this.affiliationId = affiliationId;
+    }
+
+    public String getDepartmentName() {
+        return departmentName;
+    }
+
+    public void setDepartmentName(String departmentName) {
+        this.departmentName = departmentName;
+    }
+
+    public String getRoleTitle() {
+        return roleTitle;
+    }
+
+    public void setRoleTitle(String roleTitle) {
+        this.roleTitle = roleTitle;
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/ORCIDItem.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/ORCIDItem.java
new file mode 100644
index 000000000..6bc47bc26
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/ORCIDItem.java
@@ -0,0 +1,14 @@
+
+package eu.dnetlib.dhp.collection.orcid.model;
+
+public class ORCIDItem {
+
+    private String orcid;
+
+    public String getOrcid() {
+        return orcid;
+    }
+
+    public void setOrcid(String orcid) {
+        this.orcid = orcid;
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Pid.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Pid.java
new file mode 100644
index 000000000..077dc2550
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Pid.java
@@ -0,0 +1,33 @@
+
+package eu.dnetlib.dhp.collection.orcid.model;
+
+public class Pid {
+
+    private String value;
+
+    private String schema;
+
+    public Pid() {
+    }
+
+    public Pid(String value, String schema) {
+        this.value = value;
+        this.schema = schema;
+    }
+
+    public String getValue() {
+        return value;
+    }
+
+    public void setValue(String value) {
+        this.value = value;
+    }
+
+    public String getSchema() {
+        return schema;
+    }
+
+    public void setSchema(String schema) {
+        this.schema = schema;
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Work.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Work.java
new file mode 100644
index 000000000..670170323
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/model/Work.java
@@ -0,0 +1,35 @@
+
+package eu.dnetlib.dhp.collection.orcid.model;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class Work extends ORCIDItem {
+
+    private String title;
+
+    private List<Pid> pids;
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public List<Pid> getPids() {
+        return pids;
+    }
+
+    public void setPids(List<Pid> pids) {
+        this.pids = pids;
+    }
+
+    public void addPid(Pid pid) {
+        if (pids == null)
+            pids = new ArrayList<>();
+        pids.add(pid);
+    }
+}
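The model classes are kept as plain JavaBeans (no-arg construction plus getters and setters) because SparkGenerateORCIDTable below maps them with Spark's bean encoder; a one-line sketch of what that relies on:

    final Encoder<Author> authorEncoder = Encoders.bean(Author.class); // bean conventions required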
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/download_orcid_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/download_orcid_parameter.json
new file mode 100644
index 000000000..4a84cbbb9
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/download_orcid_parameter.json
@@ -0,0 +1,21 @@
+[
+  {
+    "paramName": "n",
+    "paramLongName": "namenode",
+    "paramDescription": "the Name Node URI",
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the target PATH where the files are downloaded",
+    "paramRequired": true
+  },
+  {
+    "paramName": "a",
+    "paramLongName": "apiURL",
+    "paramDescription": "the Figshare API URL listing all the dump files",
+    "paramRequired": true
+  }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/extract_orcid_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/extract_orcid_parameter.json
new file mode 100644
index 000000000..4af371875
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/extract_orcid_parameter.json
@@ -0,0 +1,21 @@
+[
+  {
+    "paramName": "n",
+    "paramLongName": "namenode",
+    "paramDescription": "the Name Node URI",
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the target PATH where the files are extracted",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the PATH where the tar.gz files were downloaded",
+    "paramRequired": true
+  }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/generate_orcid_table_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/generate_orcid_table_parameter.json
new file mode 100644
index 000000000..01d81ea97
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/generate_orcid_table_parameter.json
@@ -0,0 +1,21 @@
+[
+  {
+    "paramName": "m",
+    "paramLongName": "master",
+    "paramDescription": "the master name",
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the target PATH of the DataFrame tables",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the PATH of the ORCID sequence files",
+    "paramRequired": true
+  }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/oozie_app/config-default.xml
new file mode 100644
index 000000000..dd3c32c62
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/oozie_app/config-default.xml
@@ -0,0 +1,23 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/oozie_app/workflow.xml
new file mode 100644
index 000000000..1a5f425e5
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/oozie_app/workflow.xml
@@ -0,0 +1,81 @@
+<workflow-app name="download_and_extract_orcid_dump" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>targetPath</name>
+            <description>the path to store the original ORCID dump</description>
+        </property>
+        <property>
+            <name>apiURL</name>
+            <description>the Figshare API URL to retrieve the list of files to download</description>
+        </property>
+    </parameters>
+
+    <start to="DownloadDUMP"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="DownloadDUMP">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.collection.orcid.DownloadORCIDDumpApplication</main-class>
+            <arg>--namenode</arg><arg>${nameNode}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--apiURL</arg><arg>${apiURL}</arg>
+        </java>
+        <ok to="ExtractDump"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractDump">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.collection.orcid.ExtractORCIDDump</main-class>
+            <java-opts>-Xmx6g</java-opts>
+            <arg>--namenode</arg><arg>${nameNode}</arg>
+            <arg>--sourcePath</arg><arg>${targetPath}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}/extracted</arg>
+        </java>
+        <ok to="GenerateTables"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="GenerateTables">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Generate ORCID Tables</name>
+            <class>eu.dnetlib.dhp.collection.orcid.SparkGenerateORCIDTable</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=2g
+                --conf spark.sql.shuffle.partitions=3000
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${targetPath}/extracted</arg>
+            <arg>--targetPath</arg><arg>${targetPath}/tables</arg>
+            <arg>--master</arg><arg>yarn</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/preprocess_orcid_dump_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/preprocess_orcid_dump_parameter.json
new file mode 100644
index 000000000..4a84cbbb9
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/orcid/preprocess_orcid_dump_parameter.json
@@ -0,0 +1,21 @@
+[
+  {
+    "paramName": "n",
+    "paramLongName": "namenode",
+    "paramDescription": "the Name Node URI",
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the target PATH where the files are downloaded",
+    "paramRequired": true
+  },
+  {
+    "paramName": "a",
+    "paramLongName": "apiURL",
+    "paramDescription": "the Figshare API URL listing all the dump files",
+    "paramRequired": true
+  }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/orcid/SparkGenerateORCIDTable.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/orcid/SparkGenerateORCIDTable.scala
new file mode 100644
index 000000000..f0c4cd214
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/orcid/SparkGenerateORCIDTable.scala
@@ -0,0 +1,101 @@
+package eu.dnetlib.dhp.collection.orcid
+
+import eu.dnetlib.dhp.application.AbstractScalaApplication
+import eu.dnetlib.dhp.collection.orcid.model.{Author, Employment, Pid, Work}
+import org.apache.hadoop.io.Text
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+
+class SparkGenerateORCIDTable(propertyPath: String, args: Array[String], log: Logger)
+    extends AbstractScalaApplication(propertyPath, args, log: Logger) {
+
+  /** All Spark applications run this method,
+    * where the whole logic of the Spark node is defined
+    */
+  override def run(): Unit = {
+    val sourcePath: String = parser.get("sourcePath")
+    log.info("found parameters sourcePath: {}", sourcePath)
+    val targetPath: String = parser.get("targetPath")
+    log.info("found parameters targetPath: {}", targetPath)
+    extractORCIDTable(spark, sourcePath, targetPath)
+    extractORCIDEmploymentsTable(spark, sourcePath, targetPath)
+    extractORCIDWorksTable(spark, sourcePath, targetPath)
+  }
+
+  def extractORCIDTable(spark: SparkSession, sourcePath: String, targetPath: String): Unit = {
+    val sc: SparkContext = spark.sparkContext
+    import spark.implicits._
+    val df = sc
+      .sequenceFile(sourcePath, classOf[Text], classOf[Text])
+      .map { case (x, y) => (x.toString, y.toString) }
+      .toDF
+      .as[(String, String)]
+    implicit val orcidAuthor: Encoder[Author] = Encoders.bean(classOf[Author])
+//    implicit val orcidPID: Encoder[Pid] = Encoders.bean(classOf[Pid])
+    df.filter(r => r._1.contains("summaries"))
+      .map { r =>
+        val p = new OrcidParser
+        p.parseSummary(r._2)
+      }
+      .filter(p => p != null)
+      .write
+      .mode(SaveMode.Overwrite)
+      .save(s"$targetPath/Authors")
+  }
+
+  def extractORCIDWorksTable(spark: SparkSession, sourcePath: String, targetPath: String): Unit = {
+    val sc: SparkContext = spark.sparkContext
+    import spark.implicits._
+    val df = sc
+      .sequenceFile(sourcePath, classOf[Text], classOf[Text])
+      .map { case (x, y) => (x.toString, y.toString) }
+      .toDF
+      .as[(String, String)]
+    implicit val orcidWorkAuthor: Encoder[Work] = Encoders.bean(classOf[Work])
+    implicit val orcidPID: Encoder[Pid] = Encoders.bean(classOf[Pid])
+    df.filter(r => r._1.contains("works"))
+      .map { r =>
+        val p = new OrcidParser
+        p.parseWork(r._2)
+      }
+      .filter(p => p != null)
+      .write
+      .mode(SaveMode.Overwrite)
+      .save(s"$targetPath/Works")
+  }
+
+  def extractORCIDEmploymentsTable(spark: SparkSession, sourcePath: String, targetPath: String): Unit = {
+    val sc: SparkContext = spark.sparkContext
+    import spark.implicits._
+    val df = sc
+      .sequenceFile(sourcePath, classOf[Text], classOf[Text])
+      .map { case (x, y) => (x.toString, y.toString) }
+      .toDF
+      .as[(String, String)]
+    implicit val orcidEmploymentAuthor: Encoder[Employment] = Encoders.bean(classOf[Employment])
+    implicit val orcidPID: Encoder[Pid] = Encoders.bean(classOf[Pid])
+    df.filter(r => r._1.contains("employments"))
+      .map { r =>
+        val p = new OrcidParser
+        p.parseEmployment(r._2)
+      }
+      .filter(p => p != null)
+      .write
+      .mode(SaveMode.Overwrite)
+      .save(s"$targetPath/Employments")
+  }
+}
+
+object SparkGenerateORCIDTable {
+
+  val log: Logger = LoggerFactory.getLogger(SparkGenerateORCIDTable.getClass)
+
+  def main(args: Array[String]): Unit = {
+
+    new SparkGenerateORCIDTable("/eu/dnetlib/dhp/collection/orcid/generate_orcid_table_parameter.json", args, log)
+      .initialize()
+      .run()
+
+  }
+}
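The job writes the three datasets with Spark's default Parquet format under $targetPath/Authors, $targetPath/Works and $targetPath/Employments. An illustrative read-back, where the local master and the path are assumptions:

    final SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
    final Dataset<Row> authors = spark.read().load("/data/orcid/tables/Authors");
    authors.printSchema();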
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/orcid/DownloadORCIDTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/orcid/DownloadORCIDTest.java
new file mode 100644
index 000000000..be5555fc0
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/orcid/DownloadORCIDTest.java
@@ -0,0 +1,158 @@
+
+package eu.dnetlib.dhp.collection.orcid;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.collection.orcid.model.ORCIDItem;
+
+public class DownloadORCIDTest {
+
+    private final Logger log = LoggerFactory.getLogger(DownloadORCIDTest.class);
+
+    @Test
+    public void testSummary() throws Exception {
+        final String xml = IOUtils
+            .toString(
+                Objects.requireNonNull(getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/orcid/summary.xml")));
+
+        final OrcidParser parser = new OrcidParser();
+        final ORCIDItem orcidItem = parser.parseSummary(xml);
+
+        final ObjectMapper mapper = new ObjectMapper();
+        System.out.println(mapper.writeValueAsString(orcidItem));
+    }
+
+    @Test
+    public void testParsingWork() throws Exception {
+
+        final List<String> works_path = Arrays
+            .asList(
+                "/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-2536-4498.xml",
+                "/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-5982-8983.xml",
+                "/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191.xml",
+                "/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191-similarity.xml",
+                "/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191_contributors.xml");
+
+        final OrcidParser parser = new OrcidParser();
+        final ObjectMapper mapper = new ObjectMapper();
+        
works_path.stream().map(s -> { + try { + return IOUtils + .toString( + Objects + .requireNonNull( + getClass() + .getResourceAsStream( + s))); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).forEach(s -> { + try { + System.out.println(mapper.writeValueAsString(parser.parseWork(s))); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + @Test + public void testParsingEmployments() throws Exception { + + final List works_path = Arrays + .asList( + "/eu/dnetlib/dhp/collection/orcid/employment.xml", + "/eu/dnetlib/dhp/collection/orcid/employment_2.xml", + "/eu/dnetlib/dhp/collection/orcid/employment_3.xml" + + ); + + final OrcidParser parser = new OrcidParser(); + final ObjectMapper mapper = new ObjectMapper(); + works_path.stream().map(s -> { + try { + return IOUtils + .toString( + Objects + .requireNonNull( + getClass() + .getResourceAsStream( + s))); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).forEach(s -> { + try { + System.out.println(mapper.writeValueAsString(parser.parseEmployment(s))); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-2536-4498.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-2536-4498.xml new file mode 100644 index 000000000..2c89b83f6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-2536-4498.xml @@ -0,0 +1,69 @@ + + + 2016-09-01T19:22:46.768Z + 2022-05-25T03:48:56.968Z + + + https://orcid.org/client/0000-0002-5982-8983 + 0000-0002-5982-8983 + orcid.org + + Scopus - Elsevier + + https://orcid.org/0000-0001-5010-5001 + 0000-0001-5010-5001 + orcid.org + + Quang Nguyen + + + Vision outcomes and major complications after endovascular coil embolization of ophthalmic segment aneurysms + + American Journal of Neuroradiology + + bibtex + @article{Nguyen2014,title = {Vision outcomes and major complications after endovascular coil embolization of ophthalmic segment aneurysms},journal = {American Journal of Neuroradiology},year = {2014},volume = {35},number = {11},pages = {2140-2145},author = {Durst, C. and Starke, R.M. and Gaughen, J. and Nguyen, Q. and Patrie, J. and Jensen, M.E. and Evans, A.J.}} + + journal-article + + 2014 + + + + doi + 10.3174/ajnr.A4032 + 10.3174/ajnr.a4032 + self + + + eid + 2-s2.0-84911865199 + 2-s2.0-84911865199 + self + + + http://www.scopus.com/inward/record.url?eid=2-s2.0-84911865199&partnerID=MN8TOARS + + + Durst, C. + + + Starke, R.M. + + + Gaughen, J. + + + Nguyen, Q. + + + Patrie, J. + + + Jensen, M.E. + + + Evans, A.J. + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-5982-8983.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-5982-8983.xml new file mode 100644 index 000000000..63b4405f1 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-5982-8983.xml @@ -0,0 +1,79 @@ + + + 2018-11-01T19:49:45.562Z + 2018-11-01T19:49:45.562Z + + + https://orcid.org/client/0000-0002-5982-8983 + 0000-0002-5982-8983 + orcid.org + + Scopus - Elsevier + + + "Calling Out" in class: Degrees of candor in addressing social injustices in + racially homogenous and heterogeneous U.S. 
history classrooms + + Journal of Social Studies Research + + bibtex + @article{Massaro2018,title = {{"}Calling Out{"} in class: Degrees of + candor in addressing social injustices in racially homogenous and heterogeneous U.S. + history classrooms},journal = {Journal of Social Studies Research},year = {2018},author + = {Parkhouse, H. and Massaro, V.R.}} + + journal-article + + 2018 + + + + doi + 10.1016/j.jssr.2018.01.004 + 10.1016/j.jssr.2018.01.004 + self + + + eid + 2-s2.0-85041949043 + 2-s2.0-85041949043 + self + + + http://www.scopus.com/inward/record.url?eid=2-s2.0-85041949043&partnerID=MN8TOARS + + + Parkhouse, H. + + + Massaro, V.R. + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191-similarity.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191-similarity.xml new file mode 100644 index 000000000..650d5a4cb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191-similarity.xml @@ -0,0 +1,113 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + Abdel-Dayem K + + first + author + + + + Abdel-Dayem Fake + + first + author + + + + Eweda II + + first + author + + + + El-Sherbiny A + + first + author + + + + Dimitry MO + + first + author + + + + Nammas W + + first + author + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191.xml new file mode 100644 index 000000000..83752b145 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191.xml @@ -0,0 +1,106 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 
649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + Khair Abde Daye + + first + author + + + + Eweda II + + first + author + + + + El-Sherbiny A + + first + author + + + + Dimitry MO + + first + author + + + + Nammas W + + first + author + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191_contributors.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191_contributors.xml new file mode 100644 index 000000000..26e64aeda --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191_contributors.xml @@ -0,0 +1,101 @@ + + + 2016-12-12T23:02:05.233Z + 2016-12-13T09:08:16.412Z + + + https://orcid.org/0000-0002-9157-3431 + 0000-0002-9157-3431 + orcid.org + + Europe PubMed Central + + + Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which + Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for + ST-Segment-Elevation Myocardial Infarction. + + + formatted-unspecified + Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta + Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016 + + journal-article + + 2016 + 11 + + + + pmid + 27899851 + 27899851 + self + + + pmc + PMC5126442 + PMC5126442 + self + + + http://europepmc.org/abstract/med/27899851 + + + + seq0 + role0 + + + + creditname1 + + + creditname2 + + seq2 + + + + + creditname3 + + + role3 + + + + + + seq4 + role4 + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment.xml new file mode 100644 index 000000000..89c7f7020 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment.xml @@ -0,0 +1,50 @@ + + + 2016-09-01T19:21:05.791Z + 2016-09-01T19:21:05.791Z + + + https://orcid.org/0000-0001-5010-5001 + 0000-0001-5010-5001 + orcid.org + + Quang Nguyen + + + Beth Israel Deaconess Medical Center + + Boston + MA + US + + + 1859 + RINGGOLD + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment_2.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment_2.xml new file mode 100644 index 000000000..c0e88e236 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment_2.xml @@ -0,0 +1,55 @@ + + + 2018-09-03T01:46:19.474Z + 2018-09-03T01:46:19.474Z + + + https://orcid.org/0000-0001-5011-3001 + 0000-0001-5011-3001 + orcid.org + + zhengyan li + + + 2008 + 09 + 01 + + + Anhui Academy of Agricultural Sciences + + Hefei + Anhui + CN + + + 125385 + RINGGOLD + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment_3.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment_3.xml new file mode 100644 index 000000000..8e7857fb8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/employment_3.xml @@ -0,0 +1,62 @@ + + + 2021-03-11T14:48:29.603Z + 2021-03-11T14:48:29.603Z + + + 
https://orcid.org/0000-0001-5012-1001 + 0000-0001-5012-1001 + orcid.org + + Asma Bazzi + + Pathology and Laboratory Medicine + Medical Laboratory Technologist + + 1994 + 10 + 01 + + + 2000 + 06 + 30 + + + American University of Beirut + + Hamra + Beirut + LB + + + 11238 + RINGGOLD + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/summary.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/summary.xml new file mode 100644 index 000000000..1f5a1bff1 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/orcid/summary.xml @@ -0,0 +1,581 @@ + + + + https://orcid.org/0000-0001-5045-1000 + 0000-0001-5045-1000 + orcid.org + + + es + + + Direct + 2023-01-17T23:50:40.215Z + 2023-09-04T17:51:57.749Z + true + true + true + + + + 2023-01-17T23:50:40.472Z + 2023-01-17T23:50:40.472Z + Patricio + Sánchez Quinchuela + + + + 2023-01-19T13:47:33.653Z + 2023-01-19T13:47:33.653Z + Especialista de vinculación con la sociedad y docente de la Universidad de las Artes. Magister en Economía Social y Solidaria por el IAEN; Magister en Proyectos Sociales y Productivos por la UNACH. Licenciado en Artes UCE. Licenciado en Castellano y Literatura por la UNACH. Doctorando del programa de Sociología de la UNED España. Larga trayectoria vinculado a las organizaciones sociales acompañando procesos de gestión cultural, formación de liderazgos y economía solidaria. + + + + + + + 2018-02-05T23:27:36.636Z + + 2013-03-08T03:20:39.347Z + 2018-02-05T23:27:36.636Z + + + https://orcid.org/client/0000-0002-5982-8983 + 0000-0002-5982-8983 + orcid.org + + Scopus - Elsevier + + https://orcid.org/0000-0001-7291-3210 + 0000-0001-7291-3210 + orcid.org + + Paolo Manghi + + Scopus Author ID + 6602255248 + http://www.scopus.com/inward/authorDetails.url?authorID=6602255248&partnerID=MN8TOARS + self + + + + + 2023-09-04T17:51:57.749Z + + 2023-01-19T13:49:48.482Z + + 2023-01-19T13:49:48.482Z + + + 2023-01-19T13:49:48.482Z + 2023-01-19T13:49:48.482Z + + + https://orcid.org/0000-0001-5045-1000 + 0000-0001-5045-1000 + orcid.org + + Patricio Sánchez Quinchuela + + Programa de Maestría + Becario del programa de Maestría en Economía Social y Solidaria + + 2014 + 10 + 20 + + + Instituto de Altos Estudios Nacionales + + Quito + Pichincha + EC + + + https://ror.org/011g3me54 + ROR + + + + + + + 2023-01-18T21:41:03.175Z + + 2023-01-18T21:41:03.175Z + + + 2023-01-18T21:41:03.175Z + 2023-01-18T21:41:03.175Z + + + https://orcid.org/0000-0001-5045-1000 + 0000-0001-5045-1000 + orcid.org + + Patricio Sánchez Quinchuela + + Programa de Doctorado en Sociología + Doctorando del Programa de Sociología + + 2020 + 11 + 06 + + + Universidad Nacional de Educación a Distancia Facultad de Ciencias Políticas y Sociología + + Madrid + Comunidad de Madrid + ES + + + 223339 + RINGGOLD + + + + + + + 2023-01-18T21:25:07.138Z + + 2023-01-18T21:22:21.513Z + + + 2023-01-17T23:57:08.246Z + 2023-01-18T21:22:21.513Z + + + https://orcid.org/0000-0001-5045-1000 + 0000-0001-5045-1000 + orcid.org + + Patricio Sánchez Quinchuela + + Dirección de Vinculación con la Sociedad + Especialista de Proyectos y docente + + 2021 + 11 + 01 + + + Universidad de las Artes + + Guayaquil + Guayas + EC + + + https://ror.org/016drwn73 + ROR + + + + + + 2023-01-18T21:25:07.138Z + + + 2023-01-18T21:25:07.138Z + 2023-01-18T21:25:07.138Z + + + https://orcid.org/0000-0001-5045-1000 + 0000-0001-5045-1000 + orcid.org + + Patricio Sánchez Quinchuela + + 
  * Director, Dirección de Vinculación con la Sociedad, Universidad Regional Amazónica IKIAM, Tena, Napo, EC, 2019-11-05 to 2021-10-31 (ROR https://ror.org/05xedqd83; url http://ikiam.edu.ec)
  * Miembro, Artes Escénicas, Casa de la Cultura Ecuatoriana, Riobamba, Sierra Centro, EC, from 2000-07-15
  * Magister en Economía Social y Solidaria, Programa de Gobernabilidad, Instituto de Altos Estudios Nacionales, Quito, Pichincha, EC, 2014-10-20 to 2017-01-26 (ROR https://ror.org/011g3me54)
  * Magister en Proyectos Sociales y Productivos, Posgrados, Universidad Nacional de Chimborazo, Riobamba, Chimborazo, EC, 2001-03-09 to 2003-02-27 (ROR https://ror.org/059wmd288)
  * Licenciado en Ciencias de la Educación en Castellano y Literatura, Universidad Nacional de Chimborazo, Riobamba, Chimborazo, EC, 1994-10-03 to 2000-01-31 (ROR https://ror.org/059wmd288)
  * Licenciado en Artes, Facultad de Artes, Universidad Central del Ecuador, Quito, Pichincha, EC, 1989-09-05 to 1997-08-07 (FUNDREF http://dx.doi.org/10.13039/100019134)
- works:
  * "Experience in a non-capitalist way: solidarity funds that do not tax interest on the use of money", book-chapter, 2023-06-07, isbn 978-9942-29-089-2 (part-of), in "Finanzas éticas y solidarias en América Latina: diagnósticos, debates y propuestas"
  * "Incidence of artistic practices in the social transformation of the territory. study of case: Hilarte Association, Guayaquil-Ecuador", conference-abstract, 2022-10-06
  * "Más allá de la transferencia de conocimientos, un espacio para el interaprendizaje y el diálogo de saberes", conference-poster, 2018-11-30, other-id 2018 (self), url https://drive.google.com/drive/folders/1Tclz6isxGzSjTq-hfTnxe6M1nux-88wF?usp=drive_link
  * "Promotion of the popular and solidarity economy from the state: principles and challenges in the experience of Ecuador", dissertation-thesis, 2017-01-26
  * "La Rebelión de los Dioses", registered-copyright, 2001-08-28, journal title "Editorial pedagógica freire", url https://drive.google.com/drive/folders/1Tclz6isxGzSjTq-hfTnxe6M1nux-88wF?usp=drive_link]
\ No newline at end of file
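Aside (not part of the patch): summary.xml gives the new code one complete record to exercise — one person, one external PID, nine affiliations and five works. A matching standalone sanity check along the same lines, again JAXP by local name and assuming the ORCID 3.0 "work-summary" element name; the SummaryFixtureSketch class is hypothetical.

package eu.dnetlib.dhp.collection.orcid;

import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;

/**
 * Standalone sketch (not part of the patch): counts the work summaries
 * in the summary.xml fixture, assuming the ORCID 3.0 "work-summary"
 * local element name.
 */
public class SummaryFixtureSketch {

	public static void main(String[] args) throws Exception {
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		factory.setNamespaceAware(true);
		Document doc = factory
			.newDocumentBuilder()
			.parse(
				SummaryFixtureSketch.class
					.getResourceAsStream("/eu/dnetlib/dhp/collection/orcid/summary.xml"));

		int works = doc.getElementsByTagNameNS("*", "work-summary").getLength();
		System.out.println("work summaries in fixture: " + works); // expected for this fixture: 5
	}
}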