From 7d759947ae06430f8732fd86cec562fa1edfa077 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Wed, 22 Apr 2020 14:41:19 +0200
Subject: [PATCH] Use VTD to parse ORCID XML records; set 4g heap space

---
 .../doiboost/orcid/OrcidDSManager.java        |  97 ++++----
 .../doiboost/orcid/SummariesDecompressor.java | 234 ++++++++++--------
 .../doiboost/orcid/xml/XMLRecordParser.java   | 165 ++++++------
 .../orcid/oozie_app/config-default.xml        |   4 +
 .../dhp/doiboost/orcid/oozie_app/workflow.xml |  20 +-
 .../orcid/xml/XMLRecordParserTest.java        |  40 +++
 .../orcid/xml/summary_0000-0001-6828-479X.xml |  71 ++++++
 .../doiboost/orcid/xml/summary_error.xml      |  33 +++
 8 files changed, 408 insertions(+), 256 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-6828-479X.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_error.xml

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
index e350877a9..a55ed0b65 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
@@ -1,75 +1,78 @@
 package eu.dnetlib.doiboost.orcid;
 
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import java.io.IOException;
 import java.net.URI;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import org.mortbay.log.Log;
 
 public class OrcidDSManager {
 
-	private static final Logger logger = LoggerFactory.getLogger(OrcidDSManager.class);
-	
-	private String hdfsServerUri;
+    private String hdfsServerUri;
     private String hdfsOrcidDefaultPath;
     private String summariesFileNameTarGz;
     private String outputAuthorsPath;
-    
+
     public static void main(String[] args) throws IOException, Exception {
-    	logger.info("OrcidDSManager started");
-    	OrcidDSManager orcidDSManager = new OrcidDSManager();
-		orcidDSManager.loadArgs(args);
-		orcidDSManager.generateAuthors();
+        OrcidDSManager orcidDSManager = new OrcidDSManager();
+        orcidDSManager.loadArgs(args);
+        orcidDSManager.generateAuthors();
     }
 
     public void generateAuthors() throws Exception {
-    	Configuration conf = initConfigurationObject();
-    	FileSystem fs = initFileSystemObject(conf);
-    	String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
-    	logger.info("Started parsing "+tarGzUri);
-    	Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsPath).concat(Long.toString(System.currentTimeMillis())).concat("/authors.seq"));
-    	SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
+        Configuration conf = initConfigurationObject();
+        FileSystem fs = initFileSystemObject(conf);
+        String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
+        Path outputPath =
+                new Path(
+                        hdfsServerUri
+                                .concat(hdfsOrcidDefaultPath)
+                                .concat(outputAuthorsPath)
+                                .concat("authors.seq"));
+        SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
     }
-    
+
     private Configuration initConfigurationObject() {
-    	// ====== Init HDFS File System Object
-    	Configuration conf = new Configuration();
-    	// Set FileSystem URI
-    	conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
-    	// Because of Maven
-    	conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
-    	conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
-    	return conf;
+        // ====== Init HDFS File System Object
+        Configuration conf = new Configuration();
+        // Set FileSystem URI
+        conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
+        // Because of Maven
+        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+        return conf;
     }
-    
+
     private FileSystem initFileSystemObject(Configuration conf) {
-    	//Get the filesystem - HDFS
-    	FileSystem fs = null;
-    	try {
-    		fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
-		} catch (IOException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		}
-    	return fs;
+        // Get the filesystem - HDFS
+        FileSystem fs = null;
+        try {
+            fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
+        } catch (IOException e) {
+            // TODO: the IOException is only printed here, so callers receive a null FileSystem
+            e.printStackTrace();
+        }
+        return fs;
     }
- 
+
     private void loadArgs(String[] args) throws IOException, Exception {
-    	final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(OrcidDSManager.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
+        final ArgumentApplicationParser parser =
+                new ArgumentApplicationParser(
+                        IOUtils.toString(
+                                OrcidDSManager.class.getResourceAsStream(
+                                        "/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
         parser.parseArgument(args);
 
-        final String hdfsServerUri = parser.get("hdfsServerUri");
-        logger.info("HDFS URI: "+hdfsServerUri);
-        Path hdfsOrcidDefaultPath = new Path(parser.get("hdfsOrcidDefaultPath"));
-        logger.info("Default Path: "+hdfsOrcidDefaultPath);
-        final String summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
-        logger.info("Summaries File Name: "+summariesFileNameTarGz);
-        final String outputAuthorsPath = parser.get("summariesFileNameTarGz");
-        logger.info("Output Authors Data: "+outputAuthorsPath);
+        hdfsServerUri = parser.get("hdfsServerUri");
+        Log.info("HDFS URI: " + hdfsServerUri);
+        hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
+        Log.info("Default Path: " + hdfsOrcidDefaultPath);
+        summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
+        Log.info("Summaries File Name: " + summariesFileNameTarGz);
+        outputAuthorsPath = parser.get("outputAuthorsPath");
+        Log.info("Output Authors Data: " + outputAuthorsPath);
     }
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
index 2a8d6d6de..a5343a557 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@@ -3,6 +3,11 @@ package eu.dnetlib.doiboost.orcid;
 import eu.dnetlib.doiboost.orcid.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.hadoop.conf.Configuration;
@@ -13,23 +18,14 @@ import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
-import org.apache.log4j.Logger;
-import org.xml.sax.SAXException;
-
-import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.URI;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.xpath.XPathExpressionException;
+import org.mortbay.log.Log;
 
 public class SummariesDecompressor {
-	
-	private static final Logger logger = Logger.getLogger(SummariesDecompressor.class);
-    
-    public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath) throws Exception {
+
+    private static final int MAX_XML_RECORDS_PARSED = -1;
+
+    public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath)
+            throws Exception {
         String uri = inputUri;
         FileSystem fs = FileSystem.get(URI.create(uri), conf);
         Path inputPath = new Path(uri);
@@ -42,101 +38,123 @@ public class SummariesDecompressor {
         CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
         InputStream gzipInputStream = null;
         try {
-        	gzipInputStream = codec.createInputStream(fs.open(inputPath));
-        	parseTarSummaries(fs, conf, gzipInputStream, outputPath);
-            
+            gzipInputStream = codec.createInputStream(fs.open(inputPath));
+            parseTarSummaries(fs, conf, gzipInputStream, outputPath);
+
         } finally {
-        	logger.debug("Closing gzip stream");
+            Log.debug("Closing gzip stream");
             IOUtils.closeStream(gzipInputStream);
         }
     }
-    
-    private static void parseTarSummaries(FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
-    	int counter = 0;
-    	int nameFound = 0;
-    	int surnameFound = 0;
-    	int creditNameFound = 0;
-    	int errorFromOrcidFound = 0;
-    	int xmlParserErrorFound = 0;
-    	try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
-    		TarArchiveEntry entry = null;
-    		
-    		try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
-	                SequenceFile.Writer.file(outputPath), SequenceFile.Writer.keyClass(Text.class),
-	                SequenceFile.Writer.valueClass(Text.class))) {
-    			
-			    while ((entry = tais.getNextTarEntry()) != null) {
-			        String filename = entry.getName();
-			        if (entry.isDirectory()) {
-			        	logger.debug("Directory entry name: "+entry.getName());
-			        } else {
-			        	logger.debug("XML record entry name: "+entry.getName());
-			        	counter++;
-			        	BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput
-			            String line;
-			            StringBuffer buffer = new StringBuffer();
-			            while ((line = br.readLine()) != null) {
-			                buffer.append(line);
-			            }
-			        	try (ByteArrayInputStream bais = new ByteArrayInputStream(buffer.toString().getBytes())) {
-							AuthorData authorData = XMLRecordParser.parse(bais);
-							if (authorData!=null) {
-								if (authorData.getErrorCode()!=null) {
-									errorFromOrcidFound+=1;
-									logger.debug("error from Orcid with code "+authorData.getErrorCode()+" for oid "+entry.getName());
-									continue;
-								}
-								String jsonData = JsonWriter.create(authorData);
-								logger.debug("oid: "+authorData.getOid() + " data: "+jsonData);
-								
-					            final Text key = new Text(authorData.getOid());
-					            final Text value = new Text(jsonData);
-					            
-					            try {
-	                                writer.append(key, value);
-	                            } catch (IOException e) {
-	                            	logger.error("Writing to sequence file: "+e.getMessage());
-	                            	e.printStackTrace();
-	                                throw new RuntimeException(e);
-	                            }
-						          
-					            if (authorData.getName()!=null) {
-					            	nameFound+=1;
-					            }
-					            if (authorData.getSurname()!=null) {
-					            	surnameFound+=1;
-					            }
-					            if (authorData.getCreditName()!=null) {
-					            	creditNameFound+=1;
-					            }
-					            
-						        }
-							else {
-								logger.error("Data not retrievable ["+entry.getName()+"] "+buffer.toString());
-								xmlParserErrorFound+=1;
-							}
-								
-						} catch (XPathExpressionException | ParserConfigurationException | SAXException e) {
-							logger.error("Parsing record from tar archive: "+e.getMessage());
-							e.printStackTrace();
-						}
-			        }
-			        
-			        if ((counter % 100000) == 0) {
-			        	logger.info("Current xml records parsed: "+counter);
-			        }
-			    }
-    		}
-		} catch (IOException e) {
-			logger.error("Parsing record from gzip archive: "+e.getMessage());
-			throw new RuntimeException(e);
-		}
-    	logger.info("Summaries parse completed");
-    	logger.info("Total XML records parsed: "+counter);
-    	logger.info("Name found: "+nameFound);
-    	logger.info("Surname found: "+surnameFound);
-    	logger.info("Credit name found: "+creditNameFound);
-    	logger.info("Error from Orcid found: "+errorFromOrcidFound);
-    	logger.info("Error parsing xml record found: "+xmlParserErrorFound);
+
+    private static void parseTarSummaries(
+            FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
+        int counter = 0;
+        int nameFound = 0;
+        int surnameFound = 0;
+        int creditNameFound = 0;
+        int errorFromOrcidFound = 0;
+        int xmlParserErrorFound = 0;
+        try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+            TarArchiveEntry entry = null;
+
+            try (SequenceFile.Writer writer =
+                    SequenceFile.createWriter(
+                            conf,
+                            SequenceFile.Writer.file(outputPath),
+                            SequenceFile.Writer.keyClass(Text.class),
+                            SequenceFile.Writer.valueClass(Text.class))) {
+                while ((entry = tais.getNextTarEntry()) != null) {
+                    String filename = entry.getName();
+                    try {
+                        if (entry.isDirectory()) {
+                            Log.debug("Directory entry name: " + entry.getName());
+                        } else {
+                            Log.debug("XML record entry name: " + entry.getName());
+                            counter++;
+                            BufferedReader br =
+                                    new BufferedReader(
+                                            new InputStreamReader(
+                                                    tais)); // Read directly from tarInput
+                            String line;
+                            StringBuffer buffer = new StringBuffer();
+                            while ((line = br.readLine()) != null) {
+                                buffer.append(line);
+                            }
+                            AuthorData authorData =
+                                    XMLRecordParser.VTDParse(buffer.toString().getBytes());
+                            if (authorData != null) {
+                                if (authorData.getErrorCode() != null) {
+                                    errorFromOrcidFound += 1;
+                                    Log.debug(
+                                            "error from Orcid with code "
+                                                    + authorData.getErrorCode()
+                                                    + " for oid "
+                                                    + entry.getName());
+                                    continue;
+                                }
+                                String jsonData = JsonWriter.create(authorData);
+                                Log.debug("oid: " + authorData.getOid() + " data: " + jsonData);
+
+                                final Text key = new Text(authorData.getOid());
+                                final Text value = new Text(jsonData);
+
+                                try {
+                                    writer.append(key, value);
+                                } catch (IOException e) {
+                                    Log.debug("Writing to sequence file: " + e.getMessage());
+                                    Log.debug(e);
+                                    throw new RuntimeException(e);
+                                }
+
+                                if (authorData.getName() != null) {
+                                    nameFound += 1;
+                                }
+                                if (authorData.getSurname() != null) {
+                                    surnameFound += 1;
+                                }
+                                if (authorData.getCreditName() != null) {
+                                    creditNameFound += 1;
+                                }
+
+                            } else {
+                                Log.warn(
+                                        "Data not retrievable ["
+                                                + entry.getName()
+                                                + "] "
+                                                + buffer.toString());
+                                xmlParserErrorFound += 1;
+                            }
+                        }
+                    } catch (Exception e) {
+                        Log.warn(
+                                "Parsing record from tar archive and xml record: "
+                                        + filename
+                                        + "  "
+                                        + e.getMessage());
+                        Log.warn(e);
+                    }
+
+                    if ((counter % 100000) == 0) {
+                        Log.info("Current xml records parsed: " + counter);
+                    }
+
+                    if ((MAX_XML_RECORDS_PARSED > -1) && (counter > MAX_XML_RECORDS_PARSED)) {
+                        break;
+                    }
+                }
+            }
+        } catch (IOException e) {
+            Log.warn("Parsing record from gzip archive: " + e.getMessage());
+            Log.warn(e);
+            throw new RuntimeException(e);
+        }
+        Log.info("Summaries parse completed");
+        Log.info("Total XML records parsed: " + counter);
+        Log.info("Name found: " + nameFound);
+        Log.info("Surname found: " + surnameFound);
+        Log.info("Credit name found: " + creditNameFound);
+        Log.info("Error from Orcid found: " + errorFromOrcidFound);
+        Log.info("Error parsing xml record found: " + xmlParserErrorFound);
     }
-}
\ No newline at end of file
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
index 54be2f316..bdaba8202 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java
@@ -1,98 +1,81 @@
 package eu.dnetlib.doiboost.orcid.xml;
 
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.util.Iterator;
-
-import javax.xml.namespace.NamespaceContext;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.xpath.XPath;
-import javax.xml.xpath.XPathConstants;
-import javax.xml.xpath.XPathExpressionException;
-import javax.xml.xpath.XPathFactory;
-
+import com.ximpleware.AutoPilot;
+import com.ximpleware.EOFException;
+import com.ximpleware.EncodingException;
+import com.ximpleware.EntityException;
+import com.ximpleware.ParseException;
+import com.ximpleware.VTDGen;
+import com.ximpleware.VTDNav;
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
-import org.apache.commons.lang.StringUtils;
-import org.w3c.dom.Document;
-import org.xml.sax.SAXException;
-
+import java.util.Arrays;
+import java.util.List;
 
 public class XMLRecordParser {
 
-	public static AuthorData parse(ByteArrayInputStream bytesStream) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException {
-		bytesStream.reset();
-		DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
-		builderFactory.setNamespaceAware(true);
-		DocumentBuilder builder = builderFactory.newDocumentBuilder();
-		
-		Document xmlDocument = builder.parse(bytesStream);
-		XPath xPath = XPathFactory.newInstance().newXPath();
-		xPath.setNamespaceContext(new NamespaceContext() {
-		    @Override
-		    public Iterator getPrefixes(String arg0) {
-		        return null;
-		    }
-		    @Override
-		    public String getPrefix(String arg0) {
-		        return null;
-		    }
-		    @Override
-		    public String getNamespaceURI(String arg0) {
-		        if ("common".equals(arg0)) {
-		            return "http://www.orcid.org/ns/common";
-		        }
-		        else if ("person".equals(arg0)) {
-		            return "http://www.orcid.org/ns/person";
-		        }
-		        else if ("personal-details".equals(arg0)) {
-		            return "http://www.orcid.org/ns/personal-details";
-		        }
-		        else if ("other-name".equals(arg0)) {
-		            return "http://www.orcid.org/ns/other-name";
-		        }
-		        else if ("record".equals(arg0)) {
-		            return "http://www.orcid.org/ns/record";
-		        }
-		        else if ("error".equals(arg0)) {
-		            return "http://www.orcid.org/ns/error";
-		        }
-		        return null;
-		    }
-		});
-		
-		AuthorData authorData = new AuthorData();
-		String errorPath = "//error:response-code";
-		String error = (String)xPath.compile(errorPath).evaluate(xmlDocument, XPathConstants.STRING);
-		if (!StringUtils.isBlank(error)) {
-			authorData.setErrorCode(error);
-			return authorData;
-		}
-		String oidPath = "//record:record/@path";
-		String oid = (String)xPath.compile(oidPath).evaluate(xmlDocument, XPathConstants.STRING);
-		if (!StringUtils.isBlank(oid)) {
-			oid = oid.substring(1);
-			authorData.setOid(oid);
-		}
-		else {
-			return null;
-		}
-		String namePath = "//personal-details:given-names";
-		String name = (String)xPath.compile(namePath).evaluate(xmlDocument, XPathConstants.STRING);
-		if (!StringUtils.isBlank(name)) {
-			authorData.setName(name);
-		}
-		String surnamePath = "//personal-details:family-name";
-		String surname = (String)xPath.compile(surnamePath).evaluate(xmlDocument, XPathConstants.STRING);
-		if (!StringUtils.isBlank(surname)) {
-			authorData.setSurname(surname);
-		}
-		String creditnamePath = "//personal-details:credit-name";
-		String creditName = (String)xPath.compile(creditnamePath).evaluate(xmlDocument, XPathConstants.STRING);
-		if (!StringUtils.isBlank(creditName)) {
-			authorData.setCreditName(creditName);
-		}
-		return authorData;
-	}
+    private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
+    private static final String NS_COMMON = "common";
+    private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
+    private static final String NS_PERSON = "person";
+    private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
+    private static final String NS_DETAILS = "personal-details";
+    private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
+    private static final String NS_OTHER = "other-name";
+    private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
+    private static final String NS_RECORD = "record";
+    private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
+    private static final String NS_ERROR = "error";
+
+    public static AuthorData VTDParse(byte[] bytes)
+            throws VtdException, EncodingException, EOFException, EntityException, ParseException {
+        final VTDGen vg = new VTDGen();
+        vg.setDoc(bytes);
+        vg.parse(true);
+        final VTDNav vn = vg.getNav();
+        final AutoPilot ap = new AutoPilot(vn);
+        ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
+        ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL);
+        ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL);
+        ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
+        ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
+        ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
+
+        AuthorData authorData = new AuthorData();
+        final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
+        if (!errors.isEmpty()) {
+            authorData.setErrorCode(errors.get(0));
+            return authorData;
+        }
+
+        List<VtdUtilityParser.Node> recordNodes =
+                VtdUtilityParser.getTextValuesWithAttributes(
+                        ap, vn, "//record:record", Arrays.asList("path"));
+        if (!recordNodes.isEmpty()) {
+            final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1);
+            authorData.setOid(oid);
+        } else {
+            return null;
+        }
+
+        final List<String> names =
+                VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names");
+        if (!names.isEmpty()) {
+            authorData.setName(names.get(0));
+        }
+
+        final List<String> surnames =
+                VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name");
+        if (!surnames.isEmpty()) {
+            authorData.setSurname(surnames.get(0));
+        }
+
+        final List<String> creditNames =
+                VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name");
+        if (!creditNames.isEmpty()) {
+            authorData.setCreditName(creditNames.get(0));
+        }
+        return authorData;
+    }
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml
index 9f009a781..5621415d9 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/config-default.xml
@@ -15,4 +15,8 @@
         <name>oozie.launcher.mapreduce.user.classpath.first</name>
         <value>true</value>
     </property>
+    <property>
+      <name>oozie.launcher.mapreduce.map.java.opts</name>
+      <value>-Xmx4g</value>
+    </property>
 </configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
index a52a56634..7a8d04187 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml
@@ -1,18 +1,18 @@
-<workflow-app name="import Crossref from index into HDFS" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>workingPath</name>
             <description>the working dir base path</description>
         </property>
     </parameters>
-
+    
     <start to="ResetWorkingPath"/>
-
-
+    
+    
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
-
+    
     <action name="ResetWorkingPath">
         <fs>
             <delete path='${workingPath}/output'/>
@@ -21,9 +21,9 @@
         <ok to="ImportOrcidSummary"/>
         <error to="Kill"/>
     </action>
-
-
-
+    
+    
+    
     <action name="ImportOrcidSummary">
         <java>
             <job-tracker>${jobTracker}</job-tracker>
@@ -31,8 +31,8 @@
             <main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
             <arg>-d</arg><arg>${workingPath}/</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
-			<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
-			<arg>-o</arg><arg>output/</arg>
+            <arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
+            <arg>-o</arg><arg>output/</arg>
         </java>
         <ok to="End"/>
         <error to="Kill"/>
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
new file mode 100644
index 000000000..1d3323b61
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java
@@ -0,0 +1,40 @@
+package eu.dnetlib.doiboost.orcid.xml;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
+
+public class XMLRecordParserTest {
+
+    @Test
+    public void testOrcidXMLRecordParser() throws Exception {
+
+        String xml =
+                IOUtils.toString(
+                        this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml"));
+
+        XMLRecordParser p = new XMLRecordParser();
+
+        AuthorData authorData = p.VTDParse(xml.getBytes());
+        assertNotNull(authorData);
+        assertNotNull(authorData.getName());
+        System.out.println("name: " + authorData.getName());
+        assertNotNull(authorData.getSurname());
+        System.out.println("surname: " + authorData.getSurname());
+    }
+
+    @Test
+    public void testOrcidXMLErrorRecordParser() throws Exception {
+
+        String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml"));
+
+        XMLRecordParser p = new XMLRecordParser();
+
+        AuthorData authorData = p.VTDParse(xml.getBytes());
+        assertNotNull(authorData);
+        assertNotNull(authorData.getErrorCode());
+        System.out.println("error: " + authorData.getErrorCode());
+    }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-6828-479X.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-6828-479X.xml
new file mode 100644
index 000000000..559352751
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_0000-0001-6828-479X.xml
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<record:record xmlns:address="http://www.orcid.org/ns/address"
+    xmlns:email="http://www.orcid.org/ns/email"
+    xmlns:history="http://www.orcid.org/ns/history"
+    xmlns:employment="http://www.orcid.org/ns/employment"
+    xmlns:education="http://www.orcid.org/ns/education"
+    xmlns:other-name="http://www.orcid.org/ns/other-name"
+    xmlns:deprecated="http://www.orcid.org/ns/deprecated"
+    xmlns:funding="http://www.orcid.org/ns/funding"
+    xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+    xmlns:service="http://www.orcid.org/ns/service"
+    xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+    xmlns:distinction="http://www.orcid.org/ns/distinction"
+    xmlns:internal="http://www.orcid.org/ns/internal"
+    xmlns:membership="http://www.orcid.org/ns/membership"
+    xmlns:person="http://www.orcid.org/ns/person"
+    xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+    xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
+    xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
+    xmlns:activities="http://www.orcid.org/ns/activities"
+    xmlns:qualification="http://www.orcid.org/ns/qualification"
+    xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+    xmlns:error="http://www.orcid.org/ns/error"
+    xmlns:preferences="http://www.orcid.org/ns/preferences"
+    xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+    xmlns:work="http://www.orcid.org/ns/work"
+    xmlns:peer-review="http://www.orcid.org/ns/peer-review" path="/0000-0001-6828-479X">
+    <common:orcid-identifier>
+        <common:uri>https://orcid.org/0000-0001-6828-479X</common:uri>
+        <common:path>0000-0001-6828-479X</common:path>
+        <common:host>orcid.org</common:host>
+    </common:orcid-identifier>
+    <preferences:preferences>
+        <preferences:locale>en</preferences:locale>
+    </preferences:preferences>
+    <history:history>
+        <history:creation-method>Member-referred</history:creation-method>
+        <history:submission-date>2017-02-17T06:16:06.428Z</history:submission-date>
+        <common:last-modified-date>2017-10-04T04:38:43.529Z</common:last-modified-date>
+        <history:claimed>true</history:claimed>
+        <history:verified-email>true</history:verified-email>
+        <history:verified-primary-email>true</history:verified-primary-email>
+    </history:history>
+    <person:person path="/0000-0001-6828-479X/person">
+        <person:name visibility="public" path="0000-0001-6828-479X">
+            <common:created-date>2017-02-17T06:16:06.428Z</common:created-date>
+            <common:last-modified-date>2017-02-17T06:16:06.652Z</common:last-modified-date>
+            <personal-details:given-names>Masahide</personal-details:given-names>
+            <personal-details:family-name>Terazima</personal-details:family-name>
+        </person:name>
+        <other-name:other-names path="/0000-0001-6828-479X/other-names"/>
+        <researcher-url:researcher-urls path="/0000-0001-6828-479X/researcher-urls"/>
+        <email:emails path="/0000-0001-6828-479X/email"/>
+        <address:addresses path="/0000-0001-6828-479X/address"/>
+        <keyword:keywords path="/0000-0001-6828-479X/keywords"/>
+        <external-identifier:external-identifiers path="/0000-0001-6828-479X/external-identifiers"/>
+    </person:person>
+    <activities:activities-summary path="/0000-0001-6828-479X/activities">
+        <activities:distinctions path="/0000-0001-6828-479X/distinctions"/>
+        <activities:educations path="/0000-0001-6828-479X/educations"/>
+        <activities:employments path="/0000-0001-6828-479X/employments"/>
+        <activities:fundings path="/0000-0001-6828-479X/fundings"/>
+        <activities:invited-positions path="/0000-0001-6828-479X/invited-positions"/>
+        <activities:memberships path="/0000-0001-6828-479X/memberships"/>
+        <activities:peer-reviews path="/0000-0001-6828-479X/peer-reviews"/>
+        <activities:qualifications path="/0000-0001-6828-479X/qualifications"/>
+        <activities:research-resources path="/0000-0001-6828-479X/research-resources"/>
+        <activities:services path="/0000-0001-6828-479X/services"/>
+        <activities:works path="/0000-0001-6828-479X/works"/>
+    </activities:activities-summary>
+</record:record>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_error.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_error.xml
new file mode 100644
index 000000000..a5eaff8a0
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/summary_error.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<error:error xmlns:address="http://www.orcid.org/ns/address"
+    xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
+    xmlns:employment="http://www.orcid.org/ns/employment"
+    xmlns:education="http://www.orcid.org/ns/education"
+    xmlns:other-name="http://www.orcid.org/ns/other-name"
+    xmlns:deprecated="http://www.orcid.org/ns/deprecated"
+    xmlns:funding="http://www.orcid.org/ns/funding"
+    xmlns:research-resource="http://www.orcid.org/ns/research-resource"
+    xmlns:service="http://www.orcid.org/ns/service"
+    xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
+    xmlns:distinction="http://www.orcid.org/ns/distinction"
+    xmlns:internal="http://www.orcid.org/ns/internal"
+    xmlns:membership="http://www.orcid.org/ns/membership"
+    xmlns:person="http://www.orcid.org/ns/person"
+    xmlns:personal-details="http://www.orcid.org/ns/personal-details"
+    xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
+    xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
+    xmlns:activities="http://www.orcid.org/ns/activities"
+    xmlns:qualification="http://www.orcid.org/ns/qualification"
+    xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
+    xmlns:error="http://www.orcid.org/ns/error"
+    xmlns:preferences="http://www.orcid.org/ns/preferences"
+    xmlns:invited-position="http://www.orcid.org/ns/invited-position"
+    xmlns:work="http://www.orcid.org/ns/work"
+    xmlns:peer-review="http://www.orcid.org/ns/peer-review">
+    <error:response-code>409</error:response-code>
+    <error:developer-message>409 Conflict: The ORCID record is locked and cannot be edited. ORCID
+        https://orcid.org/0000-0002-9716-679X</error:developer-message>
+    <error:user-message>The ORCID record is locked.</error:user-message>
+    <error:error-code>9018</error:error-code>
+    <error:more-info>https://members.orcid.org/api/resources/troubleshooting</error:more-info>
+</error:error>
\ No newline at end of file