From 7d759947ae06430f8732fd86cec562fa1edfa077 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 22 Apr 2020 14:41:19 +0200 Subject: [PATCH] used vtd for parsing orcid xml record, set 4g heapspace --- .../doiboost/orcid/OrcidDSManager.java | 97 ++++---- .../doiboost/orcid/SummariesDecompressor.java | 234 ++++++++++-------- .../doiboost/orcid/xml/XMLRecordParser.java | 165 ++++++------ .../orcid/oozie_app/config-default.xml | 4 + .../dhp/doiboost/orcid/oozie_app/workflow.xml | 20 +- .../orcid/xml/XMLRecordParserTest.java | 40 +++ .../orcid/xml/summary_0000-0001-6828-479X.xml | 71 ++++++ .../doiboost/orcid/xml/summary_error.xml | 33 +++ 8 files changed, 408 insertions(+), 256 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-6828-479X.xml create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_error.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index e350877a9..a55ed0b65 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -1,75 +1,78 @@ package eu.dnetlib.doiboost.orcid; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.io.IOException; import java.net.URI; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import org.mortbay.log.Log; public class OrcidDSManager { - private static final Logger logger = LoggerFactory.getLogger(OrcidDSManager.class); - - private String hdfsServerUri; + private String hdfsServerUri; private String hdfsOrcidDefaultPath; private String summariesFileNameTarGz; private String outputAuthorsPath; - + public static void main(String[] args) throws IOException, Exception { - logger.info("OrcidDSManager started"); - OrcidDSManager orcidDSManager = new OrcidDSManager(); - orcidDSManager.loadArgs(args); - orcidDSManager.generateAuthors(); + OrcidDSManager orcidDSManager = new OrcidDSManager(); + orcidDSManager.loadArgs(args); + orcidDSManager.generateAuthors(); } public void generateAuthors() throws Exception { - Configuration conf = initConfigurationObject(); - FileSystem fs = initFileSystemObject(conf); - String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz); - logger.info("Started parsing "+tarGzUri); - Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsPath).concat(Long.toString(System.currentTimeMillis())).concat("/authors.seq")); - SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath); + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz); + Path outputPath = + new Path( + hdfsServerUri + .concat(hdfsOrcidDefaultPath) + .concat(outputAuthorsPath) + .concat("authors.seq")); + SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath); } - + private Configuration initConfigurationObject() { - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath)); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - return conf; + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath)); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + return conf; } - + private FileSystem initFileSystemObject(Configuration conf) { - //Get the filesystem - HDFS - FileSystem fs = null; - try { - fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return fs; + // Get the filesystem - HDFS + FileSystem fs = null; + try { + fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return fs; } - + private void loadArgs(String[] args) throws IOException, Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(OrcidDSManager.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + OrcidDSManager.class.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); parser.parseArgument(args); - final String hdfsServerUri = parser.get("hdfsServerUri"); - logger.info("HDFS URI: "+hdfsServerUri); - Path hdfsOrcidDefaultPath = new Path(parser.get("hdfsOrcidDefaultPath")); - logger.info("Default Path: "+hdfsOrcidDefaultPath); - final String summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); - logger.info("Summaries File Name: "+summariesFileNameTarGz); - final String outputAuthorsPath = parser.get("summariesFileNameTarGz"); - logger.info("Output Authors Data: "+outputAuthorsPath); + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); + Log.info("Default Path: " + hdfsOrcidDefaultPath); + summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); + Log.info("Summaries File Name: " + summariesFileNameTarGz); + outputAuthorsPath = parser.get("outputAuthorsPath"); + Log.info("Output Authors Data: " + outputAuthorsPath); } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index 2a8d6d6de..a5343a557 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -3,6 +3,11 @@ package eu.dnetlib.doiboost.orcid; import eu.dnetlib.doiboost.orcid.json.JsonWriter; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URI; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.hadoop.conf.Configuration; @@ -13,23 +18,14 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; -import org.apache.log4j.Logger; -import org.xml.sax.SAXException; - -import java.io.BufferedReader; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URI; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.xpath.XPathExpressionException; +import org.mortbay.log.Log; public class SummariesDecompressor { - - private static final Logger logger = Logger.getLogger(SummariesDecompressor.class); - - public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath) throws Exception { + + private static final int MAX_XML_RECORDS_PARSED = -1; + + public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath) + throws Exception { String uri = inputUri; FileSystem fs = FileSystem.get(URI.create(uri), conf); Path inputPath = new Path(uri); @@ -42,101 +38,123 @@ public class SummariesDecompressor { CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); InputStream gzipInputStream = null; try { - gzipInputStream = codec.createInputStream(fs.open(inputPath)); - parseTarSummaries(fs, conf, gzipInputStream, outputPath); - + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + parseTarSummaries(fs, conf, gzipInputStream, outputPath); + } finally { - logger.debug("Closing gzip stream"); + Log.debug("Closing gzip stream"); IOUtils.closeStream(gzipInputStream); } } - - private static void parseTarSummaries(FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) { - int counter = 0; - int nameFound = 0; - int surnameFound = 0; - int creditNameFound = 0; - int errorFromOrcidFound = 0; - int xmlParserErrorFound = 0; - try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { - TarArchiveEntry entry = null; - - try (SequenceFile.Writer writer = SequenceFile.createWriter(conf, - SequenceFile.Writer.file(outputPath), SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class))) { - - while ((entry = tais.getNextTarEntry()) != null) { - String filename = entry.getName(); - if (entry.isDirectory()) { - logger.debug("Directory entry name: "+entry.getName()); - } else { - logger.debug("XML record entry name: "+entry.getName()); - counter++; - BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput - String line; - StringBuffer buffer = new StringBuffer(); - while ((line = br.readLine()) != null) { - buffer.append(line); - } - try (ByteArrayInputStream bais = new ByteArrayInputStream(buffer.toString().getBytes())) { - AuthorData authorData = XMLRecordParser.parse(bais); - if (authorData!=null) { - if (authorData.getErrorCode()!=null) { - errorFromOrcidFound+=1; - logger.debug("error from Orcid with code "+authorData.getErrorCode()+" for oid "+entry.getName()); - continue; - } - String jsonData = JsonWriter.create(authorData); - logger.debug("oid: "+authorData.getOid() + " data: "+jsonData); - - final Text key = new Text(authorData.getOid()); - final Text value = new Text(jsonData); - - try { - writer.append(key, value); - } catch (IOException e) { - logger.error("Writing to sequence file: "+e.getMessage()); - e.printStackTrace(); - throw new RuntimeException(e); - } - - if (authorData.getName()!=null) { - nameFound+=1; - } - if (authorData.getSurname()!=null) { - surnameFound+=1; - } - if (authorData.getCreditName()!=null) { - creditNameFound+=1; - } - - } - else { - logger.error("Data not retrievable ["+entry.getName()+"] "+buffer.toString()); - xmlParserErrorFound+=1; - } - - } catch (XPathExpressionException | ParserConfigurationException | SAXException e) { - logger.error("Parsing record from tar archive: "+e.getMessage()); - e.printStackTrace(); - } - } - - if ((counter % 100000) == 0) { - logger.info("Current xml records parsed: "+counter); - } - } - } - } catch (IOException e) { - logger.error("Parsing record from gzip archive: "+e.getMessage()); - throw new RuntimeException(e); - } - logger.info("Summaries parse completed"); - logger.info("Total XML records parsed: "+counter); - logger.info("Name found: "+nameFound); - logger.info("Surname found: "+surnameFound); - logger.info("Credit name found: "+creditNameFound); - logger.info("Error from Orcid found: "+errorFromOrcidFound); - logger.info("Error parsing xml record found: "+xmlParserErrorFound); + + private static void parseTarSummaries( + FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) { + int counter = 0; + int nameFound = 0; + int surnameFound = 0; + int creditNameFound = 0; + int errorFromOrcidFound = 0; + int xmlParserErrorFound = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = null; + + try (SequenceFile.Writer writer = + SequenceFile.createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class))) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); + try { + if (entry.isDirectory()) { + Log.debug("Directory entry name: " + entry.getName()); + } else { + Log.debug("XML record entry name: " + entry.getName()); + counter++; + BufferedReader br = + new BufferedReader( + new InputStreamReader( + tais)); // Read directly from tarInput + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + AuthorData authorData = + XMLRecordParser.VTDParse(buffer.toString().getBytes()); + if (authorData != null) { + if (authorData.getErrorCode() != null) { + errorFromOrcidFound += 1; + Log.debug( + "error from Orcid with code " + + authorData.getErrorCode() + + " for oid " + + entry.getName()); + continue; + } + String jsonData = JsonWriter.create(authorData); + Log.debug("oid: " + authorData.getOid() + " data: " + jsonData); + + final Text key = new Text(authorData.getOid()); + final Text value = new Text(jsonData); + + try { + writer.append(key, value); + } catch (IOException e) { + Log.debug("Writing to sequence file: " + e.getMessage()); + Log.debug(e); + throw new RuntimeException(e); + } + + if (authorData.getName() != null) { + nameFound += 1; + } + if (authorData.getSurname() != null) { + surnameFound += 1; + } + if (authorData.getCreditName() != null) { + creditNameFound += 1; + } + + } else { + Log.warn( + "Data not retrievable [" + + entry.getName() + + "] " + + buffer.toString()); + xmlParserErrorFound += 1; + } + } + } catch (Exception e) { + Log.warn( + "Parsing record from tar archive and xml record: " + + filename + + " " + + e.getMessage()); + Log.warn(e); + } + + if ((counter % 100000) == 0) { + Log.info("Current xml records parsed: " + counter); + } + + if ((MAX_XML_RECORDS_PARSED > -1) && (counter > MAX_XML_RECORDS_PARSED)) { + break; + } + } + } + } catch (IOException e) { + Log.warn("Parsing record from gzip archive: " + e.getMessage()); + Log.warn(e); + throw new RuntimeException(e); + } + Log.info("Summaries parse completed"); + Log.info("Total XML records parsed: " + counter); + Log.info("Name found: " + nameFound); + Log.info("Surname found: " + surnameFound); + Log.info("Credit name found: " + creditNameFound); + Log.info("Error from Orcid found: " + errorFromOrcidFound); + Log.info("Error parsing xml record found: " + xmlParserErrorFound); } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java index 54be2f316..bdaba8202 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java @@ -1,98 +1,81 @@ package eu.dnetlib.doiboost.orcid.xml; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.util.Iterator; - -import javax.xml.namespace.NamespaceContext; -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpressionException; -import javax.xml.xpath.XPathFactory; - +import com.ximpleware.AutoPilot; +import com.ximpleware.EOFException; +import com.ximpleware.EncodingException; +import com.ximpleware.EntityException; +import com.ximpleware.ParseException; +import com.ximpleware.VTDGen; +import com.ximpleware.VTDNav; +import eu.dnetlib.dhp.parser.utility.VtdException; +import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.doiboost.orcid.model.AuthorData; -import org.apache.commons.lang.StringUtils; -import org.w3c.dom.Document; -import org.xml.sax.SAXException; - +import java.util.Arrays; +import java.util.List; public class XMLRecordParser { - public static AuthorData parse(ByteArrayInputStream bytesStream) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException { - bytesStream.reset(); - DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); - builderFactory.setNamespaceAware(true); - DocumentBuilder builder = builderFactory.newDocumentBuilder(); - - Document xmlDocument = builder.parse(bytesStream); - XPath xPath = XPathFactory.newInstance().newXPath(); - xPath.setNamespaceContext(new NamespaceContext() { - @Override - public Iterator getPrefixes(String arg0) { - return null; - } - @Override - public String getPrefix(String arg0) { - return null; - } - @Override - public String getNamespaceURI(String arg0) { - if ("common".equals(arg0)) { - return "http://www.orcid.org/ns/common"; - } - else if ("person".equals(arg0)) { - return "http://www.orcid.org/ns/person"; - } - else if ("personal-details".equals(arg0)) { - return "http://www.orcid.org/ns/personal-details"; - } - else if ("other-name".equals(arg0)) { - return "http://www.orcid.org/ns/other-name"; - } - else if ("record".equals(arg0)) { - return "http://www.orcid.org/ns/record"; - } - else if ("error".equals(arg0)) { - return "http://www.orcid.org/ns/error"; - } - return null; - } - }); - - AuthorData authorData = new AuthorData(); - String errorPath = "//error:response-code"; - String error = (String)xPath.compile(errorPath).evaluate(xmlDocument, XPathConstants.STRING); - if (!StringUtils.isBlank(error)) { - authorData.setErrorCode(error); - return authorData; - } - String oidPath = "//record:record/@path"; - String oid = (String)xPath.compile(oidPath).evaluate(xmlDocument, XPathConstants.STRING); - if (!StringUtils.isBlank(oid)) { - oid = oid.substring(1); - authorData.setOid(oid); - } - else { - return null; - } - String namePath = "//personal-details:given-names"; - String name = (String)xPath.compile(namePath).evaluate(xmlDocument, XPathConstants.STRING); - if (!StringUtils.isBlank(name)) { - authorData.setName(name); - } - String surnamePath = "//personal-details:family-name"; - String surname = (String)xPath.compile(surnamePath).evaluate(xmlDocument, XPathConstants.STRING); - if (!StringUtils.isBlank(surname)) { - authorData.setSurname(surname); - } - String creditnamePath = "//personal-details:credit-name"; - String creditName = (String)xPath.compile(creditnamePath).evaluate(xmlDocument, XPathConstants.STRING); - if (!StringUtils.isBlank(creditName)) { - authorData.setCreditName(creditName); - } - return authorData; - } + private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common"; + private static final String NS_COMMON = "common"; + private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person"; + private static final String NS_PERSON = "person"; + private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details"; + private static final String NS_DETAILS = "personal-details"; + private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name"; + private static final String NS_OTHER = "other-name"; + private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record"; + private static final String NS_RECORD = "record"; + private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error"; + private static final String NS_ERROR = "error"; + + public static AuthorData VTDParse(byte[] bytes) + throws VtdException, EncodingException, EOFException, EntityException, ParseException { + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); + ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL); + ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL); + ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL); + ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL); + ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); + + AuthorData authorData = new AuthorData(); + final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); + if (!errors.isEmpty()) { + authorData.setErrorCode(errors.get(0)); + return authorData; + } + + List recordNodes = + VtdUtilityParser.getTextValuesWithAttributes( + ap, vn, "//record:record", Arrays.asList("path")); + if (!recordNodes.isEmpty()) { + final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1); + authorData.setOid(oid); + } else { + return null; + } + + final List names = + VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names"); + if (!names.isEmpty()) { + authorData.setName(names.get(0)); + } + + final List surnames = + VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name"); + if (!surnames.isEmpty()) { + authorData.setSurname(surnames.get(0)); + } + + final List creditNames = + VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name"); + if (!creditNames.isEmpty()) { + authorData.setCreditName(creditNames.get(0)); + } + return authorData; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml index 9f009a781..5621415d9 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml @@ -15,4 +15,8 @@ oozie.launcher.mapreduce.user.classpath.first true + + oozie.launcher.mapreduce.map.java.opts + -Xmx4g + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml index a52a56634..7a8d04187 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml @@ -1,18 +1,18 @@ - + workingPath the working dir base path - + - - + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + @@ -21,9 +21,9 @@ - - - + + + ${jobTracker} @@ -31,8 +31,8 @@ eu.dnetlib.doiboost.orcid.OrcidDSManager -d${workingPath}/ -n${nameNode} - -fORCID_2019_summaries.tar.gz - -ooutput/ + -fORCID_2019_summaries.tar.gz + -ooutput/ diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java new file mode 100644 index 000000000..1d3323b61 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java @@ -0,0 +1,40 @@ +package eu.dnetlib.doiboost.orcid.xml; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + +public class XMLRecordParserTest { + + @Test + public void testOrcidXMLRecordParser() throws Exception { + + String xml = + IOUtils.toString( + this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml")); + + XMLRecordParser p = new XMLRecordParser(); + + AuthorData authorData = p.VTDParse(xml.getBytes()); + assertNotNull(authorData); + assertNotNull(authorData.getName()); + System.out.println("name: " + authorData.getName()); + assertNotNull(authorData.getSurname()); + System.out.println("surname: " + authorData.getSurname()); + } + + @Test + public void testOrcidXMLErrorRecordParser() throws Exception { + + String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml")); + + XMLRecordParser p = new XMLRecordParser(); + + AuthorData authorData = p.VTDParse(xml.getBytes()); + assertNotNull(authorData); + assertNotNull(authorData.getErrorCode()); + System.out.println("error: " + authorData.getErrorCode()); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-6828-479X.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-6828-479X.xml new file mode 100644 index 000000000..559352751 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-6828-479X.xml @@ -0,0 +1,71 @@ + + + + https://orcid.org/0000-0001-6828-479X + 0000-0001-6828-479X + orcid.org + + + en + + + Member-referred + 2017-02-17T06:16:06.428Z + 2017-10-04T04:38:43.529Z + true + true + true + + + + 2017-02-17T06:16:06.428Z + 2017-02-17T06:16:06.652Z + Masahide + Terazima + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_error.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_error.xml new file mode 100644 index 000000000..a5eaff8a0 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_error.xml @@ -0,0 +1,33 @@ + + + 409 + 409 Conflict: The ORCID record is locked and cannot be edited. ORCID + https://orcid.org/0000-0002-9716-679X + The ORCID record is locked. + 9018 + https://members.orcid.org/api/resources/troubleshooting + \ No newline at end of file