forked from D-Net/dnet-hadoop
First version of the dataset successfully generated from the ORCID 2020 dump
This commit is contained in:
parent
9818e74a70
commit
6bc7dbeca7
|
@ -51,7 +51,6 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>${org.apache.httpcomponents.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
|
@ -87,7 +86,6 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-text</artifactId>
|
||||
<version>${common.text.version}</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
|
|
|
@ -62,7 +62,7 @@ public class OrcidDSManager {
|
|||
.toString(
|
||||
OrcidDSManager.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
|
||||
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
|
|
|
@ -73,7 +73,7 @@ public class ActivitiesDumpReader {
|
|||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
String filename = entry.getName();
|
||||
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
try {
|
||||
if (entry.isDirectory() || !filename.contains("works")) {
|
||||
|
||||
|
@ -83,7 +83,7 @@ public class ActivitiesDumpReader {
|
|||
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
|
||||
// tarInput
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager {
|
|||
.toString(
|
||||
GenOrcidAuthorWork.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
||||
"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
|
|
|
@ -67,7 +67,7 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaPairRDD<Text, Text> summariesRDD = sc
|
||||
.sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class);
|
||||
.sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
|
||||
Dataset<AuthorData> summariesDataset = spark
|
||||
.createDataset(
|
||||
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
||||
|
@ -96,8 +96,8 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||
.filter(Objects::nonNull)
|
||||
.toJavaRDD();
|
||||
enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
|
||||
logger.info("Works enriched data saved");
|
||||
// enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath);
|
||||
logger.info("Enriched works RDD ready.");
|
||||
|
||||
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
|
||||
final LongAccumulator enrichedPublications = spark
|
||||
|
@ -132,7 +132,7 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
.write()
|
||||
.format("parquet")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(workingPath + "no_doi_dataset/output");
|
||||
.save(workingPath + outputEnrichedWorksPath);
|
||||
|
||||
logger.info("parsedPublications: " + parsedPublications.value().toString());
|
||||
logger.info("enrichedPublications: " + enrichedPublications.value().toString());
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.io.IOException;
|
|||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -40,7 +41,7 @@ public class AuthorMatcher {
|
|||
int matchCounter = 0;
|
||||
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||
Contributor contributor = null;
|
||||
contributors.forEach(c -> {
|
||||
contributors.stream().filter(c -> !StringUtils.isBlank(c.getCreditName())).forEach(c -> {
|
||||
if (simpleMatch(c.getCreditName(), author.getName()) ||
|
||||
simpleMatch(c.getCreditName(), author.getSurname()) ||
|
||||
simpleMatch(c.getCreditName(), author.getOtherName())) {
|
||||
|
@ -54,6 +55,7 @@ public class AuthorMatcher {
|
|||
Optional<Contributor> optCon = contributors
|
||||
.stream()
|
||||
.filter(c -> c.isSimpleMatch())
|
||||
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||
.map(c -> {
|
||||
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
|
||||
return c;
|
||||
|
|
|
@ -183,39 +183,34 @@ public class XMLRecordParserNoDoi {
|
|||
private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
|
||||
throws XPathParseException, NavException, XPathEvalException {
|
||||
List<Contributor> contributors = new ArrayList<Contributor>();
|
||||
int nameIndex = 0;
|
||||
ap.selectXPath("//work:contributor/work:credit-name");
|
||||
ap.selectXPath("//work:contributors/work:contributor");
|
||||
while (ap.evalXPath() != -1) {
|
||||
Contributor contributor = new Contributor();
|
||||
int t = vn.getText();
|
||||
if (t >= 0) {
|
||||
contributor.setCreditName(vn.toNormalizedString(t));
|
||||
contributors.add(nameIndex, contributor);
|
||||
nameIndex++;
|
||||
if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) {
|
||||
int val = vn.getText();
|
||||
if (val != -1) {
|
||||
contributor.setCreditName(vn.toNormalizedString(val));
|
||||
}
|
||||
vn.toElement(VTDNav.PARENT);
|
||||
}
|
||||
}
|
||||
if (contributors.size() == 0) {
|
||||
return contributors;
|
||||
}
|
||||
|
||||
int sequenceIndex = 0;
|
||||
ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-sequence");
|
||||
while (ap.evalXPath() != -1) {
|
||||
int t = vn.getText();
|
||||
if (t >= 0) {
|
||||
contributors.get(sequenceIndex).setSequence(vn.toNormalizedString(t));
|
||||
sequenceIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
int roleIndex = 0;
|
||||
ap.selectXPath("//work:contributor/work:contributor-attributes/work:contributor-role");
|
||||
while (ap.evalXPath() != -1) {
|
||||
int t = vn.getText();
|
||||
if (t >= 0) {
|
||||
contributors.get(roleIndex).setRole(vn.toNormalizedString(t));
|
||||
roleIndex++;
|
||||
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) {
|
||||
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) {
|
||||
int val = vn.getText();
|
||||
if (val != -1) {
|
||||
contributor.setSequence(vn.toNormalizedString(val));
|
||||
}
|
||||
vn.toElement(VTDNav.PARENT);
|
||||
}
|
||||
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) {
|
||||
int val = vn.getText();
|
||||
if (val != -1) {
|
||||
contributor.setRole(vn.toNormalizedString(val));
|
||||
}
|
||||
vn.toElement(VTDNav.PARENT);
|
||||
}
|
||||
vn.toElement(VTDNav.PARENT);
|
||||
}
|
||||
contributors.add(contributor);
|
||||
}
|
||||
return contributors;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
[
|
||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true},
|
||||
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
|
||||
]
|
|
@ -1,42 +0,0 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://hadoop-edge2.garr-pa1.d4science.org:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://hadoop-edge1.garr-pa1.d4science.org:18089/</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,67 +0,0 @@
|
|||
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_0</name>
|
||||
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/summaries/output'/>
|
||||
<mkdir path='${workingPath}/summaries/output'/>
|
||||
</fs>
|
||||
<ok to="check_exist_on_hdfs_summaries"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_summaries">
|
||||
<switch>
|
||||
<case to="ImportOrcidSummaries">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
|
||||
</case>
|
||||
<default to="DownloadSummaries" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="DownloadSummaries">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_0}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="ImportOrcidSummaries"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOrcidSummaries">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
||||
<arg>-o</arg><arg>summaries/output/</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -9,7 +9,7 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||
<value>-Xmx4g</value>
|
||||
<value>-Xmx2g</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Import Orcid Activities" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Gen Orcid Works-no-doi From Activities" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
|
@ -6,67 +6,67 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_0</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_1</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_2</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_3</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_4</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_5</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_6</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_7</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_8</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_9</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_X</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
||||
</property>
|
||||
|
@ -82,11 +82,11 @@
|
|||
<fs>
|
||||
<delete path='${workingPath}/no_doi_works/*'/>
|
||||
</fs>
|
||||
<ok to="fork_gen_orcid_author_work"/>
|
||||
<ok to="fork_check_download_files"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name = "fork_gen_orcid_author_work">
|
||||
<fork name = "fork_check_download_files">
|
||||
<path start = "check_exist_on_hdfs_activities_0"/>
|
||||
<path start = "check_exist_on_hdfs_activities_1"/>
|
||||
<path start = "check_exist_on_hdfs_activities_2"/>
|
||||
|
@ -102,8 +102,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_0">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_0">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_0" />
|
||||
</switch>
|
||||
|
@ -118,7 +118,7 @@
|
|||
<argument>${shell_cmd_0}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_0"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -129,7 +129,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -139,8 +139,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_1">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_1">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_1" />
|
||||
</switch>
|
||||
|
@ -155,7 +155,7 @@
|
|||
<argument>${shell_cmd_1}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_1"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -166,7 +166,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -176,8 +176,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_2">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_2">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_2" />
|
||||
</switch>
|
||||
|
@ -192,7 +192,7 @@
|
|||
<argument>${shell_cmd_2}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_2"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -203,7 +203,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -213,8 +213,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_3">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_3">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_3" />
|
||||
</switch>
|
||||
|
@ -229,7 +229,7 @@
|
|||
<argument>${shell_cmd_3}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_3"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -240,7 +240,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -250,8 +250,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_4">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_4">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_4" />
|
||||
</switch>
|
||||
|
@ -266,7 +266,7 @@
|
|||
<argument>${shell_cmd_4}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_4"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -277,7 +277,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -287,8 +287,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_5">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_5">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_5" />
|
||||
</switch>
|
||||
|
@ -303,7 +303,7 @@
|
|||
<argument>${shell_cmd_5}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_5"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -314,7 +314,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -324,8 +324,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_6">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_6">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_6" />
|
||||
</switch>
|
||||
|
@ -340,7 +340,7 @@
|
|||
<argument>${shell_cmd_6}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_6"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -351,7 +351,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -362,8 +362,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_7">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_7">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_7" />
|
||||
</switch>
|
||||
|
@ -378,7 +378,7 @@
|
|||
<argument>${shell_cmd_7}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_7"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -389,7 +389,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -399,8 +399,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_8">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_8">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_8" />
|
||||
</switch>
|
||||
|
@ -415,7 +415,7 @@
|
|||
<argument>${shell_cmd_8}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_8"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -426,7 +426,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -436,8 +436,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_9">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_9">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_9" />
|
||||
</switch>
|
||||
|
@ -452,7 +452,7 @@
|
|||
<argument>${shell_cmd_9}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_9"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -463,7 +463,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -473,8 +473,8 @@
|
|||
|
||||
<decision name="check_exist_on_hdfs_activities_X">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_X">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))}
|
||||
<case to="wait_download_phase_node">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_X" />
|
||||
</switch>
|
||||
|
@ -489,7 +489,7 @@
|
|||
<argument>${shell_cmd_X}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_X"/>
|
||||
<ok to="wait_download_phase_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -500,7 +500,7 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||
</java>
|
||||
|
@ -508,7 +508,35 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name = "wait_download_phase_node" to = "fork_gen_orcid_author_work"/>
|
||||
|
||||
<fork name = "fork_gen_orcid_author_work">
|
||||
<path start = "GenOrcidAuthorWork_0"/>
|
||||
<path start = "GenOrcidAuthorWork_1"/>
|
||||
<path start = "GenOrcidAuthorWork_2"/>
|
||||
<path start = "GenOrcidAuthorWork_3"/>
|
||||
<path start = "GenOrcidAuthorWork_4"/>
|
||||
<path start = "GenOrcidAuthorWork_5"/>
|
||||
<path start = "GenOrcidAuthorWork_6"/>
|
||||
<path start = "GenOrcidAuthorWork_7"/>
|
||||
<path start = "GenOrcidAuthorWork_8"/>
|
||||
<path start = "GenOrcidAuthorWork_9"/>
|
||||
<path start = "GenOrcidAuthorWork_X"/>
|
||||
</fork>
|
||||
|
||||
<join name = "join_node" to = "End"/>
|
||||
|
||||
<!-- <join name = "join_node" to = "fork_gen_orcid_author_work_2"/>-->
|
||||
|
||||
<!-- <fork name = "fork_gen_orcid_author_work_2">-->
|
||||
<!-- <path start = "GenOrcidAuthorWork_6"/>-->
|
||||
<!-- <path start = "GenOrcidAuthorWork_7"/>-->
|
||||
<!-- <path start = "GenOrcidAuthorWork_8"/>-->
|
||||
<!-- <path start = "GenOrcidAuthorWork_9"/>-->
|
||||
<!-- <path start = "GenOrcidAuthorWork_X"/>-->
|
||||
<!-- </fork>-->
|
||||
|
||||
<!-- <join name = "join_node_2" to = "End"/>-->
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -19,4 +19,8 @@
|
|||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||
<value>-Xmx16g</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Import Orcid Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Gen Orcid Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
|
@ -6,7 +6,7 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_0</name>
|
||||
<value>wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz
|
||||
<value>wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
|
||||
</property>
|
||||
|
@ -21,8 +21,8 @@
|
|||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/summaries/output'/>
|
||||
<mkdir path='${workingPath}/summaries/output'/>
|
||||
<delete path='${workingPath}/authors'/>
|
||||
<mkdir path='${workingPath}/authors'/>
|
||||
</fs>
|
||||
<ok to="check_exist_on_hdfs_summaries"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -31,7 +31,7 @@
|
|||
<decision name="check_exist_on_hdfs_summaries">
|
||||
<switch>
|
||||
<case to="ImportOrcidSummaries">
|
||||
${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))}
|
||||
${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))}
|
||||
</case>
|
||||
<default to="DownloadSummaries" />
|
||||
</switch>
|
||||
|
@ -57,8 +57,8 @@
|
|||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
||||
<arg>-o</arg><arg>summaries/output/</arg>
|
||||
<arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
|
||||
<arg>-o</arg><arg>authors/</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -59,7 +59,7 @@
|
|||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/no_doi_enriched_works/output'/>
|
||||
<delete path='${workingPath}/no_doi_dataset'/>
|
||||
</fs>
|
||||
<ok to="GenOrcidNoDoiDataset"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -85,7 +85,7 @@
|
|||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>-</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/</arg>
|
||||
<arg>-oew</arg><arg>no_doi_enriched_works/output</arg>
|
||||
<arg>-oew</arg><arg>no_doi_dataset</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
|
@ -38,8 +38,8 @@ public class OrcidClientTest {
|
|||
|
||||
@Test
|
||||
public void downloadTest() throws Exception {
|
||||
String record = testDownloadRecord("0000-0002-2536-4498");
|
||||
File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml");
|
||||
String record = testDownloadRecord("0000-0001-6163-2042");
|
||||
File f = new File("/tmp/downloaded_0000-0001-6163-2042.xml");
|
||||
OutputStream outStream = new FileOutputStream(f);
|
||||
IOUtils.write(record.getBytes(), outStream);
|
||||
System.out.println("saved to tmp");
|
||||
|
|
|
@ -2,15 +2,20 @@
|
|||
package eu.dnetlib.doiboost.orcidnodoi.xml;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
|
||||
import javax.validation.constraints.AssertTrue;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.text.similarity.JaccardSimilarity;
|
||||
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mortbay.log.Log;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -41,7 +46,6 @@ public class OrcidNoDoiTest {
|
|||
String orcidIdA = "0000-0003-2760-1191";
|
||||
|
||||
@Test
|
||||
// @Ignore
|
||||
public void readPublicationFieldsTest()
|
||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||
logger.info("running loadPublicationFieldsTest ....");
|
||||
|
@ -95,8 +99,7 @@ public class OrcidNoDoiTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
// @Ignore
|
||||
private void authorMatchTest() throws Exception {
|
||||
public void authorMatchTest() throws Exception {
|
||||
logger.info("running authorSimpleMatchTest ....");
|
||||
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
|
||||
AuthorData author = new AuthorData();
|
||||
|
@ -121,9 +124,60 @@ public class OrcidNoDoiTest {
|
|||
logger.error("parsing xml", e);
|
||||
}
|
||||
assertNotNull(workData);
|
||||
|
||||
Contributor a = workData.getContributors().get(0);
|
||||
assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
|
||||
|
||||
AuthorMatcher.match(author, workData.getContributors());
|
||||
GsonBuilder builder = new GsonBuilder();
|
||||
Gson gson = builder.create();
|
||||
logger.info(gson.toJson(workData));
|
||||
|
||||
assertTrue(workData.getContributors().size() == 6);
|
||||
Contributor c = workData.getContributors().get(0);
|
||||
assertTrue(c.getOid().equals("0000-0003-2760-1191"));
|
||||
assertTrue(c.getName().equals("Khairy"));
|
||||
assertTrue(c.getSurname().equals("Abdel Dayem"));
|
||||
assertTrue(c.getCreditName().equals("Abdel-Dayem K"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void readContributorsTest()
|
||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||
logger.info("running loadPublicationFieldsTest ....");
|
||||
String xml = IOUtils
|
||||
.toString(
|
||||
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml"));
|
||||
|
||||
if (xml == null) {
|
||||
logger.info("Resource not found");
|
||||
}
|
||||
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
|
||||
if (p == null) {
|
||||
logger.info("XMLRecordParserNoDoi null");
|
||||
}
|
||||
WorkDataNoDoi workData = null;
|
||||
try {
|
||||
workData = p.VTDParseWorkData(xml.getBytes());
|
||||
} catch (Exception e) {
|
||||
logger.error("parsing xml", e);
|
||||
}
|
||||
assertNotNull(workData.getContributors());
|
||||
assertTrue(workData.getContributors().size() == 5);
|
||||
assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName()));
|
||||
assertTrue(workData.getContributors().get(0).getSequence().equals("seq0"));
|
||||
assertTrue(workData.getContributors().get(0).getRole().equals("role0"));
|
||||
assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1"));
|
||||
assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence()));
|
||||
assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole()));
|
||||
assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2"));
|
||||
assertTrue(workData.getContributors().get(2).getSequence().equals("seq2"));
|
||||
assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole()));
|
||||
assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3"));
|
||||
assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence()));
|
||||
assertTrue(workData.getContributors().get(3).getRole().equals("role3"));
|
||||
assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName()));
|
||||
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
|
||||
assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<work:work xmlns:address="http://www.orcid.org/ns/address"
|
||||
xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
|
||||
xmlns:employment="http://www.orcid.org/ns/employment"
|
||||
xmlns:education="http://www.orcid.org/ns/education"
|
||||
xmlns:other-name="http://www.orcid.org/ns/other-name"
|
||||
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
|
||||
xmlns:funding="http://www.orcid.org/ns/funding"
|
||||
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
|
||||
xmlns:service="http://www.orcid.org/ns/service"
|
||||
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
|
||||
xmlns:distinction="http://www.orcid.org/ns/distinction"
|
||||
xmlns:internal="http://www.orcid.org/ns/internal"
|
||||
xmlns:membership="http://www.orcid.org/ns/membership"
|
||||
xmlns:person="http://www.orcid.org/ns/person"
|
||||
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
|
||||
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
|
||||
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
|
||||
xmlns:activities="http://www.orcid.org/ns/activities"
|
||||
xmlns:qualification="http://www.orcid.org/ns/qualification"
|
||||
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
|
||||
xmlns:error="http://www.orcid.org/ns/error"
|
||||
xmlns:preferences="http://www.orcid.org/ns/preferences"
|
||||
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
|
||||
xmlns:work="http://www.orcid.org/ns/work"
|
||||
xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
|
||||
path="/0000-0003-2760-1191/work/28776099" visibility="public">
|
||||
<common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
|
||||
<common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
|
||||
<common:source>
|
||||
<common:source-orcid>
|
||||
<common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
|
||||
<common:path>0000-0002-9157-3431</common:path>
|
||||
<common:host>orcid.org</common:host>
|
||||
</common:source-orcid>
|
||||
<common:source-name>Europe PubMed Central</common:source-name>
|
||||
</common:source>
|
||||
<work:title>
|
||||
<common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
|
||||
Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
|
||||
ST-Segment-Elevation Myocardial Infarction.</common:title>
|
||||
</work:title>
|
||||
<work:citation>
|
||||
<work:citation-type>formatted-unspecified</work:citation-type>
|
||||
<work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
|
||||
Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
|
||||
</work:citation>
|
||||
<work:type>journal-article</work:type>
|
||||
<common:publication-date>
|
||||
<common:year>2016</common:year>
|
||||
<common:month>11</common:month>
|
||||
</common:publication-date>
|
||||
<common:external-ids>
|
||||
<common:external-id>
|
||||
<common:external-id-type>pmid</common:external-id-type>
|
||||
<common:external-id-value>27899851</common:external-id-value>
|
||||
<common:external-id-normalized transient="true">27899851</common:external-id-normalized>
|
||||
<common:external-id-relationship>self</common:external-id-relationship>
|
||||
</common:external-id>
|
||||
<common:external-id>
|
||||
<common:external-id-type>pmc</common:external-id-type>
|
||||
<common:external-id-value>PMC5126442</common:external-id-value>
|
||||
<common:external-id-normalized transient="true"
|
||||
>PMC5126442</common:external-id-normalized>
|
||||
<common:external-id-relationship>self</common:external-id-relationship>
|
||||
</common:external-id>
|
||||
</common:external-ids>
|
||||
<common:url>http://europepmc.org/abstract/med/27899851</common:url>
|
||||
<work:contributors>
|
||||
<work:contributor>
|
||||
<work:contributor-attributes>
|
||||
<work:contributor-sequence>seq0</work:contributor-sequence>
|
||||
<work:contributor-role>role0</work:contributor-role>
|
||||
</work:contributor-attributes>
|
||||
</work:contributor>
|
||||
<work:contributor>
|
||||
<work:credit-name>creditname1</work:credit-name>
|
||||
</work:contributor>
|
||||
<work:contributor>
|
||||
<work:credit-name>creditname2</work:credit-name>
|
||||
<work:contributor-attributes>
|
||||
<work:contributor-sequence>seq2</work:contributor-sequence>
|
||||
<work:contributor-role></work:contributor-role>
|
||||
</work:contributor-attributes>
|
||||
</work:contributor>
|
||||
<work:contributor>
|
||||
<work:credit-name>creditname3</work:credit-name>
|
||||
<work:contributor-attributes>
|
||||
<work:contributor-sequence></work:contributor-sequence>
|
||||
<work:contributor-role>role3</work:contributor-role>
|
||||
</work:contributor-attributes>
|
||||
</work:contributor>
|
||||
<work:contributor>
|
||||
<work:credit-name></work:credit-name>
|
||||
<work:contributor-attributes>
|
||||
<work:contributor-sequence>seq4</work:contributor-sequence>
|
||||
<work:contributor-role>role4</work:contributor-role>
|
||||
</work:contributor-attributes>
|
||||
</work:contributor>
|
||||
</work:contributors>
|
||||
</work:work>
|
12
pom.xml
12
pom.xml
|
@ -458,6 +458,18 @@
|
|||
<version>${jsonschemagenerator.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-text</artifactId>
|
||||
<version>${common.text.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>${org.apache.httpcomponents.version}</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
||||
|
|
Loading…
Reference in New Issue