diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java new file mode 100644 index 000000000..da44f5795 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java @@ -0,0 +1,150 @@ +package eu.dnetlib.doiboost.orcid; + +import eu.dnetlib.doiboost.orcid.json.JsonWriter; +import eu.dnetlib.doiboost.orcid.model.WorkData; +import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URI; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.mortbay.log.Log; + +public class ActivitiesDecompressor { + + private static final int MAX_XML_WORKS_PARSED = -1; + + public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath) + throws Exception { + String uri = inputUri; + FileSystem fs = FileSystem.get(URI.create(uri), conf); + Path inputPath = new Path(uri); + CompressionCodecFactory factory = new CompressionCodecFactory(conf); + CompressionCodec codec = factory.getCodec(inputPath); + if (codec == null) { + System.err.println("No codec found for " + uri); + System.exit(1); + } + CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + parseTarActivities(fs, conf, gzipInputStream, outputPath); + + } finally { + Log.debug("Closing gzip stream"); + IOUtils.closeStream(gzipInputStream); + } + } + + private static void parseTarActivities( + FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) { + int counter = 0; + int doiFound = 0; + int errorFromOrcidFound = 0; + int xmlParserErrorFound = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = null; + + try (SequenceFile.Writer writer = + SequenceFile.createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class))) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); + + try { + if (entry.isDirectory() || !filename.contains("works")) { + + } else { + Log.debug("XML work entry name: " + entry.getName()); + counter++; + BufferedReader br = + new BufferedReader( + new InputStreamReader( + tais)); // Read directly from tarInput + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + WorkData workData = + XMLRecordParser.VTDParseWorkData(buffer.toString().getBytes()); + if (workData != null) { + if (workData.getErrorCode() != null) { + errorFromOrcidFound += 1; + Log.debug( + "error from Orcid with code " + + workData.getErrorCode() + + " for entry " + + entry.getName()); + continue; + } + if (workData.isDoiFound()) { + String jsonData = JsonWriter.create(workData); + Log.debug("oid: " + workData.getOid() + " data: " + jsonData); + + final Text key = new Text(workData.getOid()); + final Text value = new Text(jsonData); + + try { + writer.append(key, value); + } catch (IOException e) { + Log.debug("Writing to sequence file: " + e.getMessage()); + Log.debug(e); + throw new RuntimeException(e); + } + doiFound += 1; + } + + } else { + Log.warn( + "Data not retrievable [" + + entry.getName() + + "] " + + buffer.toString()); + xmlParserErrorFound += 1; + } + } + } catch (Exception e) { + Log.warn( + "Parsing work from tar archive and xml work: " + + filename + + " " + + e.getMessage()); + Log.warn(e); + } + + if ((counter % 100000) == 0) { + Log.info("Current xml works parsed: " + counter); + } + + if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) { + break; + } + } + } + } catch (IOException e) { + Log.warn("Parsing work from gzip archive: " + e.getMessage()); + Log.warn(e); + throw new RuntimeException(e); + } + Log.info("Activities parse completed"); + Log.info("Total XML works parsed: " + counter); + Log.info("Total doi found: " + doiFound); + Log.info("Error from Orcid found: " + errorFromOrcidFound); + Log.info("Error parsing xml work found: " + xmlParserErrorFound); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java new file mode 100644 index 000000000..7596cf67f --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java @@ -0,0 +1,53 @@ +package eu.dnetlib.doiboost.orcid; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import java.io.IOException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.mortbay.log.Log; + +public class OrcidAuthorsDOIsDataGen extends OrcidDSManager { + + private String activitiesFileNameTarGz; + private String outputAuthorsDOIsPath; + + public static void main(String[] args) throws IOException, Exception { + OrcidAuthorsDOIsDataGen orcidAuthorsDOIsDataGen = new OrcidAuthorsDOIsDataGen(); + orcidAuthorsDOIsDataGen.loadArgs(args); + orcidAuthorsDOIsDataGen.generateAuthorsDOIsData(); + } + + public void generateAuthorsDOIsData() throws Exception { + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = + hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz); + Path outputPath = + new Path( + hdfsServerUri + .concat(hdfsOrcidDefaultPath) + .concat(outputAuthorsDOIsPath) + .concat("authors_dois.seq")); + ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath); + } + + private void loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + OrcidAuthorsDOIsDataGen.class.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json"))); + parser.parseArgument(args); + + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); + Log.info("Default Path: " + hdfsOrcidDefaultPath); + activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); + Log.info("Activities File Name: " + activitiesFileNameTarGz); + outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath"); + Log.info("Output Authors DOIs Data: " + outputAuthorsDOIsPath); + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index a55ed0b65..e89b9300d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -11,8 +11,8 @@ import org.mortbay.log.Log; public class OrcidDSManager { - private String hdfsServerUri; - private String hdfsOrcidDefaultPath; + protected String hdfsServerUri; + protected String hdfsOrcidDefaultPath; private String summariesFileNameTarGz; private String outputAuthorsPath; @@ -35,7 +35,7 @@ public class OrcidDSManager { SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath); } - private Configuration initConfigurationObject() { + protected Configuration initConfigurationObject() { // ====== Init HDFS File System Object Configuration conf = new Configuration(); // Set FileSystem URI @@ -46,7 +46,7 @@ public class OrcidDSManager { return conf; } - private FileSystem initFileSystemObject(Configuration conf) { + protected FileSystem initFileSystemObject(Configuration conf) { // Get the filesystem - HDFS FileSystem fs = null; try { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index a5343a557..286906f27 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -82,7 +82,8 @@ public class SummariesDecompressor { buffer.append(line); } AuthorData authorData = - XMLRecordParser.VTDParse(buffer.toString().getBytes()); + XMLRecordParser.VTDParseAuthorData( + buffer.toString().getBytes()); if (authorData != null) { if (authorData.getErrorCode() != null) { errorFromOrcidFound += 1; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java index 395f0c0cd..a6077139b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java @@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.orcid.json; import com.google.gson.JsonObject; import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcid.model.WorkData; public class JsonWriter { @@ -15,4 +16,11 @@ public class JsonWriter { } return author.toString(); } + + public static String create(WorkData workData) { + JsonObject work = new JsonObject(); + work.addProperty("oid", workData.getOid()); + work.addProperty("doi", workData.getDoi()); + return work.toString(); + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/WorkData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/WorkData.java new file mode 100644 index 000000000..42f0c1763 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/WorkData.java @@ -0,0 +1,42 @@ +package eu.dnetlib.doiboost.orcid.model; + +public class WorkData { + + private String oid; + private String doi; + private boolean doiFound = false; + + public boolean isDoiFound() { + return doiFound; + } + + public void setDoiFound(boolean doiFound) { + this.doiFound = doiFound; + } + + public String getOid() { + return oid; + } + + public void setOid(String oid) { + this.oid = oid; + } + + public String getDoi() { + return doi; + } + + public void setDoi(String doi) { + this.doi = doi; + } + + public String getErrorCode() { + return errorCode; + } + + public void setErrorCode(String errorCode) { + this.errorCode = errorCode; + } + + private String errorCode; +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java index bdaba8202..48971892a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java @@ -10,6 +10,7 @@ import com.ximpleware.VTDNav; import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcid.model.WorkData; import java.util.Arrays; import java.util.List; @@ -26,9 +27,13 @@ public class XMLRecordParser { private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record"; private static final String NS_RECORD = "record"; private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error"; + + private static final String NS_WORK = "work"; + private static final String NS_WORK_URL = "http://www.orcid.org/ns/work"; + private static final String NS_ERROR = "error"; - public static AuthorData VTDParse(byte[] bytes) + public static AuthorData VTDParseAuthorData(byte[] bytes) throws VtdException, EncodingException, EOFException, EntityException, ParseException { final VTDGen vg = new VTDGen(); vg.setDoc(bytes); @@ -78,4 +83,44 @@ public class XMLRecordParser { } return authorData; } + + public static WorkData VTDParseWorkData(byte[] bytes) + throws VtdException, EncodingException, EOFException, EntityException, ParseException { + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); + ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL); + ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); + + WorkData workData = new WorkData(); + final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); + if (!errors.isEmpty()) { + workData.setErrorCode(errors.get(0)); + return workData; + } + + List workNodes = + VtdUtilityParser.getTextValuesWithAttributes( + ap, vn, "//work:work", Arrays.asList("path")); + if (!workNodes.isEmpty()) { + final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1]; + workData.setOid(oid); + } else { + return null; + } + + final List dois = + VtdUtilityParser.getTextValue( + ap, + vn, + "//common:external-id-type[text()=\"doi\"]/../common:external-id-value"); + if (!dois.isEmpty()) { + workData.setDoi(dois.get(0)); + workData.setDoiFound(true); + } + return workData; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json new file mode 100644 index 000000000..131c30125 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json @@ -0,0 +1,6 @@ +[ + {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, + {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true}, + {"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml new file mode 100644 index 000000000..5621415d9 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml @@ -0,0 +1,22 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.action.sharelib.for.java + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + + oozie.launcher.mapreduce.map.java.opts + -Xmx4g + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml new file mode 100644 index 000000000..5d7222d07 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml @@ -0,0 +1,39 @@ + + + + workingPath_activities + the working dir base path + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen + -d${workingPath_activities}/ + -n${nameNode} + -fORCID_2019_activites_0.tar.gz + -ooutput/ + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java index 1d3323b61..15ab461a2 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java @@ -3,13 +3,14 @@ package eu.dnetlib.doiboost.orcid.xml; import static org.junit.jupiter.api.Assertions.assertNotNull; import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcid.model.WorkData; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; public class XMLRecordParserTest { @Test - public void testOrcidXMLRecordParser() throws Exception { + public void testOrcidAuthorDataXMLParser() throws Exception { String xml = IOUtils.toString( @@ -17,7 +18,7 @@ public class XMLRecordParserTest { XMLRecordParser p = new XMLRecordParser(); - AuthorData authorData = p.VTDParse(xml.getBytes()); + AuthorData authorData = p.VTDParseAuthorData(xml.getBytes()); assertNotNull(authorData); assertNotNull(authorData.getName()); System.out.println("name: " + authorData.getName()); @@ -32,9 +33,27 @@ public class XMLRecordParserTest { XMLRecordParser p = new XMLRecordParser(); - AuthorData authorData = p.VTDParse(xml.getBytes()); + AuthorData authorData = p.VTDParseAuthorData(xml.getBytes()); assertNotNull(authorData); assertNotNull(authorData.getErrorCode()); System.out.println("error: " + authorData.getErrorCode()); } + + @Test + public void testOrcidWorkDataXMLParser() throws Exception { + + String xml = + IOUtils.toString( + this.getClass() + .getResourceAsStream("activity_work_0000-0002-5982-8983.xml")); + + XMLRecordParser p = new XMLRecordParser(); + + WorkData workData = p.VTDParseWorkData(xml.getBytes()); + assertNotNull(workData); + assertNotNull(workData.getOid()); + System.out.println("oid: " + workData.getOid()); + assertNotNull(workData.getDoi()); + System.out.println("doi: " + workData.getDoi()); + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml new file mode 100644 index 000000000..63b4405f1 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/xml/activity_work_0000-0002-5982-8983.xml @@ -0,0 +1,79 @@ + + + 2018-11-01T19:49:45.562Z + 2018-11-01T19:49:45.562Z + + + https://orcid.org/client/0000-0002-5982-8983 + 0000-0002-5982-8983 + orcid.org + + Scopus - Elsevier + + + "Calling Out" in class: Degrees of candor in addressing social injustices in + racially homogenous and heterogeneous U.S. history classrooms + + Journal of Social Studies Research + + bibtex + @article{Massaro2018,title = {{"}Calling Out{"} in class: Degrees of + candor in addressing social injustices in racially homogenous and heterogeneous U.S. + history classrooms},journal = {Journal of Social Studies Research},year = {2018},author + = {Parkhouse, H. and Massaro, V.R.}} + + journal-article + + 2018 + + + + doi + 10.1016/j.jssr.2018.01.004 + 10.1016/j.jssr.2018.01.004 + self + + + eid + 2-s2.0-85041949043 + 2-s2.0-85041949043 + self + + + http://www.scopus.com/inward/record.url?eid=2-s2.0-85041949043&partnerID=MN8TOARS + + + Parkhouse, H. + + + Massaro, V.R. + + + \ No newline at end of file