Use VTD to parse ORCID XML records; set the Oozie launcher heap space to 4 GB (-Xmx4g)
This commit is contained in:
parent
5d46ec7d5f
commit
7d759947ae
|
@ -1,75 +1,78 @@
|
|||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
public class OrcidDSManager {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(OrcidDSManager.class);
|
||||
|
||||
private String hdfsServerUri;
|
||||
private String hdfsServerUri;
|
||||
private String hdfsOrcidDefaultPath;
|
||||
private String summariesFileNameTarGz;
|
||||
private String outputAuthorsPath;
|
||||
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
logger.info("OrcidDSManager started");
|
||||
OrcidDSManager orcidDSManager = new OrcidDSManager();
|
||||
orcidDSManager.loadArgs(args);
|
||||
orcidDSManager.generateAuthors();
|
||||
OrcidDSManager orcidDSManager = new OrcidDSManager();
|
||||
orcidDSManager.loadArgs(args);
|
||||
orcidDSManager.generateAuthors();
|
||||
}
|
||||
|
||||
public void generateAuthors() throws Exception {
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
|
||||
logger.info("Started parsing "+tarGzUri);
|
||||
Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsPath).concat(Long.toString(System.currentTimeMillis())).concat("/authors.seq"));
|
||||
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
|
||||
Path outputPath =
|
||||
new Path(
|
||||
hdfsServerUri
|
||||
.concat(hdfsOrcidDefaultPath)
|
||||
.concat(outputAuthorsPath)
|
||||
.concat("authors.seq"));
|
||||
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
|
||||
}
|
||||
|
||||
|
||||
private Configuration initConfigurationObject() {
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
return conf;
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
return conf;
|
||||
}
|
||||
|
||||
|
||||
private FileSystem initFileSystemObject(Configuration conf) {
|
||||
//Get the filesystem - HDFS
|
||||
FileSystem fs = null;
|
||||
try {
|
||||
fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
|
||||
} catch (IOException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
return fs;
|
||||
// Get the filesystem - HDFS
|
||||
FileSystem fs = null;
|
||||
try {
|
||||
fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
|
||||
} catch (IOException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
return fs;
|
||||
}
|
||||
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(OrcidDSManager.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
|
||||
final ArgumentApplicationParser parser =
|
||||
new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
OrcidDSManager.class.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||
logger.info("HDFS URI: "+hdfsServerUri);
|
||||
Path hdfsOrcidDefaultPath = new Path(parser.get("hdfsOrcidDefaultPath"));
|
||||
logger.info("Default Path: "+hdfsOrcidDefaultPath);
|
||||
final String summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
|
||||
logger.info("Summaries File Name: "+summariesFileNameTarGz);
|
||||
final String outputAuthorsPath = parser.get("summariesFileNameTarGz");
|
||||
logger.info("Output Authors Data: "+outputAuthorsPath);
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
||||
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
|
||||
Log.info("Summaries File Name: " + summariesFileNameTarGz);
|
||||
outputAuthorsPath = parser.get("outputAuthorsPath");
|
||||
Log.info("Output Authors Data: " + outputAuthorsPath);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,6 +3,11 @@ package eu.dnetlib.doiboost.orcid;
|
|||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URI;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -13,23 +18,14 @@ import org.apache.hadoop.io.SequenceFile;
|
|||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URI;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
public class SummariesDecompressor {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(SummariesDecompressor.class);
|
||||
|
||||
public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath) throws Exception {
|
||||
|
||||
private static final int MAX_XML_RECORDS_PARSED = -1;
|
||||
|
||||
public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath)
|
||||
throws Exception {
|
||||
String uri = inputUri;
|
||||
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||
Path inputPath = new Path(uri);
|
||||
|
@ -42,101 +38,123 @@ public class SummariesDecompressor {
|
|||
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||
InputStream gzipInputStream = null;
|
||||
try {
|
||||
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||
parseTarSummaries(fs, conf, gzipInputStream, outputPath);
|
||||
|
||||
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||
parseTarSummaries(fs, conf, gzipInputStream, outputPath);
|
||||
|
||||
} finally {
|
||||
logger.debug("Closing gzip stream");
|
||||
Log.debug("Closing gzip stream");
|
||||
IOUtils.closeStream(gzipInputStream);
|
||||
}
|
||||
}
|
||||
|
||||
private static void parseTarSummaries(FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
||||
int counter = 0;
|
||||
int nameFound = 0;
|
||||
int surnameFound = 0;
|
||||
int creditNameFound = 0;
|
||||
int errorFromOrcidFound = 0;
|
||||
int xmlParserErrorFound = 0;
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||
TarArchiveEntry entry = null;
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
|
||||
SequenceFile.Writer.file(outputPath), SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
String filename = entry.getName();
|
||||
if (entry.isDirectory()) {
|
||||
logger.debug("Directory entry name: "+entry.getName());
|
||||
} else {
|
||||
logger.debug("XML record entry name: "+entry.getName());
|
||||
counter++;
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
try (ByteArrayInputStream bais = new ByteArrayInputStream(buffer.toString().getBytes())) {
|
||||
AuthorData authorData = XMLRecordParser.parse(bais);
|
||||
if (authorData!=null) {
|
||||
if (authorData.getErrorCode()!=null) {
|
||||
errorFromOrcidFound+=1;
|
||||
logger.debug("error from Orcid with code "+authorData.getErrorCode()+" for oid "+entry.getName());
|
||||
continue;
|
||||
}
|
||||
String jsonData = JsonWriter.create(authorData);
|
||||
logger.debug("oid: "+authorData.getOid() + " data: "+jsonData);
|
||||
|
||||
final Text key = new Text(authorData.getOid());
|
||||
final Text value = new Text(jsonData);
|
||||
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (IOException e) {
|
||||
logger.error("Writing to sequence file: "+e.getMessage());
|
||||
e.printStackTrace();
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
if (authorData.getName()!=null) {
|
||||
nameFound+=1;
|
||||
}
|
||||
if (authorData.getSurname()!=null) {
|
||||
surnameFound+=1;
|
||||
}
|
||||
if (authorData.getCreditName()!=null) {
|
||||
creditNameFound+=1;
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
logger.error("Data not retrievable ["+entry.getName()+"] "+buffer.toString());
|
||||
xmlParserErrorFound+=1;
|
||||
}
|
||||
|
||||
} catch (XPathExpressionException | ParserConfigurationException | SAXException e) {
|
||||
logger.error("Parsing record from tar archive: "+e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
if ((counter % 100000) == 0) {
|
||||
logger.info("Current xml records parsed: "+counter);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("Parsing record from gzip archive: "+e.getMessage());
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
logger.info("Summaries parse completed");
|
||||
logger.info("Total XML records parsed: "+counter);
|
||||
logger.info("Name found: "+nameFound);
|
||||
logger.info("Surname found: "+surnameFound);
|
||||
logger.info("Credit name found: "+creditNameFound);
|
||||
logger.info("Error from Orcid found: "+errorFromOrcidFound);
|
||||
logger.info("Error parsing xml record found: "+xmlParserErrorFound);
|
||||
|
||||
private static void parseTarSummaries(
|
||||
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
||||
int counter = 0;
|
||||
int nameFound = 0;
|
||||
int surnameFound = 0;
|
||||
int creditNameFound = 0;
|
||||
int errorFromOrcidFound = 0;
|
||||
int xmlParserErrorFound = 0;
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||
TarArchiveEntry entry = null;
|
||||
|
||||
try (SequenceFile.Writer writer =
|
||||
SequenceFile.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(outputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
String filename = entry.getName();
|
||||
try {
|
||||
if (entry.isDirectory()) {
|
||||
Log.debug("Directory entry name: " + entry.getName());
|
||||
} else {
|
||||
Log.debug("XML record entry name: " + entry.getName());
|
||||
counter++;
|
||||
BufferedReader br =
|
||||
new BufferedReader(
|
||||
new InputStreamReader(
|
||||
tais)); // Read directly from tarInput
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
AuthorData authorData =
|
||||
XMLRecordParser.VTDParse(buffer.toString().getBytes());
|
||||
if (authorData != null) {
|
||||
if (authorData.getErrorCode() != null) {
|
||||
errorFromOrcidFound += 1;
|
||||
Log.debug(
|
||||
"error from Orcid with code "
|
||||
+ authorData.getErrorCode()
|
||||
+ " for oid "
|
||||
+ entry.getName());
|
||||
continue;
|
||||
}
|
||||
String jsonData = JsonWriter.create(authorData);
|
||||
Log.debug("oid: " + authorData.getOid() + " data: " + jsonData);
|
||||
|
||||
final Text key = new Text(authorData.getOid());
|
||||
final Text value = new Text(jsonData);
|
||||
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (IOException e) {
|
||||
Log.debug("Writing to sequence file: " + e.getMessage());
|
||||
Log.debug(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
if (authorData.getName() != null) {
|
||||
nameFound += 1;
|
||||
}
|
||||
if (authorData.getSurname() != null) {
|
||||
surnameFound += 1;
|
||||
}
|
||||
if (authorData.getCreditName() != null) {
|
||||
creditNameFound += 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
Log.warn(
|
||||
"Data not retrievable ["
|
||||
+ entry.getName()
|
||||
+ "] "
|
||||
+ buffer.toString());
|
||||
xmlParserErrorFound += 1;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Log.warn(
|
||||
"Parsing record from tar archive and xml record: "
|
||||
+ filename
|
||||
+ " "
|
||||
+ e.getMessage());
|
||||
Log.warn(e);
|
||||
}
|
||||
|
||||
if ((counter % 100000) == 0) {
|
||||
Log.info("Current xml records parsed: " + counter);
|
||||
}
|
||||
|
||||
if ((MAX_XML_RECORDS_PARSED > -1) && (counter > MAX_XML_RECORDS_PARSED)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Log.warn("Parsing record from gzip archive: " + e.getMessage());
|
||||
Log.warn(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
Log.info("Summaries parse completed");
|
||||
Log.info("Total XML records parsed: " + counter);
|
||||
Log.info("Name found: " + nameFound);
|
||||
Log.info("Surname found: " + surnameFound);
|
||||
Log.info("Credit name found: " + creditNameFound);
|
||||
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||
Log.info("Error parsing xml record found: " + xmlParserErrorFound);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,98 +1,81 @@
|
|||
package eu.dnetlib.doiboost.orcid.xml;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import javax.xml.namespace.NamespaceContext;
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.xpath.XPath;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import javax.xml.xpath.XPathFactory;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.EOFException;
|
||||
import com.ximpleware.EncodingException;
|
||||
import com.ximpleware.EntityException;
|
||||
import com.ximpleware.ParseException;
|
||||
import com.ximpleware.VTDGen;
|
||||
import com.ximpleware.VTDNav;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.w3c.dom.Document;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class XMLRecordParser {
|
||||
|
||||
public static AuthorData parse(ByteArrayInputStream bytesStream) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException {
|
||||
bytesStream.reset();
|
||||
DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
|
||||
builderFactory.setNamespaceAware(true);
|
||||
DocumentBuilder builder = builderFactory.newDocumentBuilder();
|
||||
|
||||
Document xmlDocument = builder.parse(bytesStream);
|
||||
XPath xPath = XPathFactory.newInstance().newXPath();
|
||||
xPath.setNamespaceContext(new NamespaceContext() {
|
||||
@Override
|
||||
public Iterator getPrefixes(String arg0) {
|
||||
return null;
|
||||
}
|
||||
@Override
|
||||
public String getPrefix(String arg0) {
|
||||
return null;
|
||||
}
|
||||
@Override
|
||||
public String getNamespaceURI(String arg0) {
|
||||
if ("common".equals(arg0)) {
|
||||
return "http://www.orcid.org/ns/common";
|
||||
}
|
||||
else if ("person".equals(arg0)) {
|
||||
return "http://www.orcid.org/ns/person";
|
||||
}
|
||||
else if ("personal-details".equals(arg0)) {
|
||||
return "http://www.orcid.org/ns/personal-details";
|
||||
}
|
||||
else if ("other-name".equals(arg0)) {
|
||||
return "http://www.orcid.org/ns/other-name";
|
||||
}
|
||||
else if ("record".equals(arg0)) {
|
||||
return "http://www.orcid.org/ns/record";
|
||||
}
|
||||
else if ("error".equals(arg0)) {
|
||||
return "http://www.orcid.org/ns/error";
|
||||
}
|
||||
return null;
|
||||
}
|
||||
});
|
||||
|
||||
AuthorData authorData = new AuthorData();
|
||||
String errorPath = "//error:response-code";
|
||||
String error = (String)xPath.compile(errorPath).evaluate(xmlDocument, XPathConstants.STRING);
|
||||
if (!StringUtils.isBlank(error)) {
|
||||
authorData.setErrorCode(error);
|
||||
return authorData;
|
||||
}
|
||||
String oidPath = "//record:record/@path";
|
||||
String oid = (String)xPath.compile(oidPath).evaluate(xmlDocument, XPathConstants.STRING);
|
||||
if (!StringUtils.isBlank(oid)) {
|
||||
oid = oid.substring(1);
|
||||
authorData.setOid(oid);
|
||||
}
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
String namePath = "//personal-details:given-names";
|
||||
String name = (String)xPath.compile(namePath).evaluate(xmlDocument, XPathConstants.STRING);
|
||||
if (!StringUtils.isBlank(name)) {
|
||||
authorData.setName(name);
|
||||
}
|
||||
String surnamePath = "//personal-details:family-name";
|
||||
String surname = (String)xPath.compile(surnamePath).evaluate(xmlDocument, XPathConstants.STRING);
|
||||
if (!StringUtils.isBlank(surname)) {
|
||||
authorData.setSurname(surname);
|
||||
}
|
||||
String creditnamePath = "//personal-details:credit-name";
|
||||
String creditName = (String)xPath.compile(creditnamePath).evaluate(xmlDocument, XPathConstants.STRING);
|
||||
if (!StringUtils.isBlank(creditName)) {
|
||||
authorData.setCreditName(creditName);
|
||||
}
|
||||
return authorData;
|
||||
}
|
||||
private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
|
||||
private static final String NS_COMMON = "common";
|
||||
private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
|
||||
private static final String NS_PERSON = "person";
|
||||
private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
|
||||
private static final String NS_DETAILS = "personal-details";
|
||||
private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
|
||||
private static final String NS_OTHER = "other-name";
|
||||
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
|
||||
private static final String NS_RECORD = "record";
|
||||
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
|
||||
private static final String NS_ERROR = "error";
|
||||
|
||||
public static AuthorData VTDParse(byte[] bytes)
|
||||
throws VtdException, EncodingException, EOFException, EntityException, ParseException {
|
||||
final VTDGen vg = new VTDGen();
|
||||
vg.setDoc(bytes);
|
||||
vg.parse(true);
|
||||
final VTDNav vn = vg.getNav();
|
||||
final AutoPilot ap = new AutoPilot(vn);
|
||||
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
|
||||
ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL);
|
||||
ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL);
|
||||
ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
|
||||
ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
|
||||
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
||||
|
||||
AuthorData authorData = new AuthorData();
|
||||
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
||||
if (!errors.isEmpty()) {
|
||||
authorData.setErrorCode(errors.get(0));
|
||||
return authorData;
|
||||
}
|
||||
|
||||
List<VtdUtilityParser.Node> recordNodes =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(
|
||||
ap, vn, "//record:record", Arrays.asList("path"));
|
||||
if (!recordNodes.isEmpty()) {
|
||||
final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1);
|
||||
authorData.setOid(oid);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
||||
final List<String> names =
|
||||
VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names");
|
||||
if (!names.isEmpty()) {
|
||||
authorData.setName(names.get(0));
|
||||
}
|
||||
|
||||
final List<String> surnames =
|
||||
VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name");
|
||||
if (!surnames.isEmpty()) {
|
||||
authorData.setSurname(surnames.get(0));
|
||||
}
|
||||
|
||||
final List<String> creditNames =
|
||||
VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name");
|
||||
if (!creditNames.isEmpty()) {
|
||||
authorData.setCreditName(creditNames.get(0));
|
||||
}
|
||||
return authorData;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,4 +15,8 @@
|
|||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||
<value>-Xmx4g</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,18 +1,18 @@
|
|||
<workflow-app name="import Crossref from index into HDFS" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/output'/>
|
||||
|
@ -21,9 +21,9 @@
|
|||
<ok to="ImportOrcidSummary"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<action name="ImportOrcidSummary">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
|
@ -31,8 +31,8 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
||||
<arg>-d</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
||||
<arg>-o</arg><arg>output/</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
||||
<arg>-o</arg><arg>output/</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
package eu.dnetlib.doiboost.orcid.xml;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class XMLRecordParserTest {
|
||||
|
||||
@Test
|
||||
public void testOrcidXMLRecordParser() throws Exception {
|
||||
|
||||
String xml =
|
||||
IOUtils.toString(
|
||||
this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml"));
|
||||
|
||||
XMLRecordParser p = new XMLRecordParser();
|
||||
|
||||
AuthorData authorData = p.VTDParse(xml.getBytes());
|
||||
assertNotNull(authorData);
|
||||
assertNotNull(authorData.getName());
|
||||
System.out.println("name: " + authorData.getName());
|
||||
assertNotNull(authorData.getSurname());
|
||||
System.out.println("surname: " + authorData.getSurname());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOrcidXMLErrorRecordParser() throws Exception {
|
||||
|
||||
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml"));
|
||||
|
||||
XMLRecordParser p = new XMLRecordParser();
|
||||
|
||||
AuthorData authorData = p.VTDParse(xml.getBytes());
|
||||
assertNotNull(authorData);
|
||||
assertNotNull(authorData.getErrorCode());
|
||||
System.out.println("error: " + authorData.getErrorCode());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<record:record xmlns:address="http://www.orcid.org/ns/address"
|
||||
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
|
||||
xmlns:employment="http://www.orcid.org/ns/employment"
|
||||
xmlns:education="http://www.orcid.org/ns/education"
|
||||
xmlns:other-name="http://www.orcid.org/ns/other-name"
|
||||
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
|
||||
xmlns:funding="http://www.orcid.org/ns/funding"
|
||||
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
|
||||
xmlns:service="http://www.orcid.org/ns/service"
|
||||
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
|
||||
xmlns:distinction="http://www.orcid.org/ns/distinction"
|
||||
xmlns:internal="http://www.orcid.org/ns/internal"
|
||||
xmlns:membership="http://www.orcid.org/ns/membership"
|
||||
xmlns:person="http://www.orcid.org/ns/person"
|
||||
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
|
||||
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
|
||||
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
|
||||
xmlns:activities="http://www.orcid.org/ns/activities"
|
||||
xmlns:qualification="http://www.orcid.org/ns/qualification"
|
||||
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
|
||||
xmlns:error="http://www.orcid.org/ns/error"
|
||||
xmlns:preferences="http://www.orcid.org/ns/preferences"
|
||||
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
|
||||
xmlns:work="http://www.orcid.org/ns/work"
|
||||
xmlns:peer-review="http://www.orcid.org/ns/peer-review" path="/0000-0001-6828-479X">
|
||||
<common:orcid-identifier>
|
||||
<common:uri>https://orcid.org/0000-0001-6828-479X</common:uri>
|
||||
<common:path>0000-0001-6828-479X</common:path>
|
||||
<common:host>orcid.org</common:host>
|
||||
</common:orcid-identifier>
|
||||
<preferences:preferences>
|
||||
<preferences:locale>en</preferences:locale>
|
||||
</preferences:preferences>
|
||||
<history:history>
|
||||
<history:creation-method>Member-referred</history:creation-method>
|
||||
<history:submission-date>2017-02-17T06:16:06.428Z</history:submission-date>
|
||||
<common:last-modified-date>2017-10-04T04:38:43.529Z</common:last-modified-date>
|
||||
<history:claimed>true</history:claimed>
|
||||
<history:verified-email>true</history:verified-email>
|
||||
<history:verified-primary-email>true</history:verified-primary-email>
|
||||
</history:history>
|
||||
<person:person path="/0000-0001-6828-479X/person">
|
||||
<person:name visibility="public" path="0000-0001-6828-479X">
|
||||
<common:created-date>2017-02-17T06:16:06.428Z</common:created-date>
|
||||
<common:last-modified-date>2017-02-17T06:16:06.652Z</common:last-modified-date>
|
||||
<personal-details:given-names>Masahide</personal-details:given-names>
|
||||
<personal-details:family-name>Terazima</personal-details:family-name>
|
||||
</person:name>
|
||||
<other-name:other-names path="/0000-0001-6828-479X/other-names"/>
|
||||
<researcher-url:researcher-urls path="/0000-0001-6828-479X/researcher-urls"/>
|
||||
<email:emails path="/0000-0001-6828-479X/email"/>
|
||||
<address:addresses path="/0000-0001-6828-479X/address"/>
|
||||
<keyword:keywords path="/0000-0001-6828-479X/keywords"/>
|
||||
<external-identifier:external-identifiers path="/0000-0001-6828-479X/external-identifiers"/>
|
||||
</person:person>
|
||||
<activities:activities-summary path="/0000-0001-6828-479X/activities">
|
||||
<activities:distinctions path="/0000-0001-6828-479X/distinctions"/>
|
||||
<activities:educations path="/0000-0001-6828-479X/educations"/>
|
||||
<activities:employments path="/0000-0001-6828-479X/employments"/>
|
||||
<activities:fundings path="/0000-0001-6828-479X/fundings"/>
|
||||
<activities:invited-positions path="/0000-0001-6828-479X/invited-positions"/>
|
||||
<activities:memberships path="/0000-0001-6828-479X/memberships"/>
|
||||
<activities:peer-reviews path="/0000-0001-6828-479X/peer-reviews"/>
|
||||
<activities:qualifications path="/0000-0001-6828-479X/qualifications"/>
|
||||
<activities:research-resources path="/0000-0001-6828-479X/research-resources"/>
|
||||
<activities:services path="/0000-0001-6828-479X/services"/>
|
||||
<activities:works path="/0000-0001-6828-479X/works"/>
|
||||
</activities:activities-summary>
|
||||
</record:record>
|
|
@ -0,0 +1,33 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<error:error xmlns:address="http://www.orcid.org/ns/address"
|
||||
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
|
||||
xmlns:employment="http://www.orcid.org/ns/employment"
|
||||
xmlns:education="http://www.orcid.org/ns/education"
|
||||
xmlns:other-name="http://www.orcid.org/ns/other-name"
|
||||
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
|
||||
xmlns:funding="http://www.orcid.org/ns/funding"
|
||||
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
|
||||
xmlns:service="http://www.orcid.org/ns/service"
|
||||
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
|
||||
xmlns:distinction="http://www.orcid.org/ns/distinction"
|
||||
xmlns:internal="http://www.orcid.org/ns/internal"
|
||||
xmlns:membership="http://www.orcid.org/ns/membership"
|
||||
xmlns:person="http://www.orcid.org/ns/person"
|
||||
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
|
||||
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
|
||||
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
|
||||
xmlns:activities="http://www.orcid.org/ns/activities"
|
||||
xmlns:qualification="http://www.orcid.org/ns/qualification"
|
||||
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
|
||||
xmlns:error="http://www.orcid.org/ns/error"
|
||||
xmlns:preferences="http://www.orcid.org/ns/preferences"
|
||||
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
|
||||
xmlns:work="http://www.orcid.org/ns/work"
|
||||
xmlns:peer-review="http://www.orcid.org/ns/peer-review">
|
||||
<error:response-code>409</error:response-code>
|
||||
<error:developer-message>409 Conflict: The ORCID record is locked and cannot be edited. ORCID
|
||||
https://orcid.org/0000-0002-9716-679X</error:developer-message>
|
||||
<error:user-message>The ORCID record is locked.</error:user-message>
|
||||
<error:error-code>9018</error:error-code>
|
||||
<error:more-info>https://members.orcid.org/api/resources/troubleshooting</error:more-info>
|
||||
</error:error>
|
Loading…
Reference in New Issue