From ea9b00ce56bdc78db6e5b46af9b6bbebf74ad138 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 20 May 2021 15:31:42 +0200 Subject: [PATCH 1/9] adjusted test --- .../dnetlib/dhp/actionmanager/project/utils/EXCELParser.java | 2 +- .../dnetlib/dhp/actionmanager/project/EXCELParserTest.java | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java index cc18c6f54..1a6ebb9e8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java @@ -32,7 +32,7 @@ public class EXCELParser { XSSFSheet sheet = wb.getSheet(sheetName); - if(sheetName == null){ + if (sheetName == null) { throw new RuntimeException("Sheet name " + sheetName + " not present in current file"); } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index 59b536cd5..72ba48f41 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -35,8 +35,9 @@ public class EXCELParserTest { EXCELParser excelParser = new EXCELParser(); - List pl = excelParser - .parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic"); + final String classForName = "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic"; + final String sheetName = "Topics"; + List pl = excelParser.parse(httpConnector.getInputSourceAsStream(URL), classForName, sheetName); Assertions.assertEquals(3837, pl.size()); From 4d6c473bf18a42a1e0fac7863eddae017d1a7eff Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 20 May 2021 18:26:42 +0200 Subject: [PATCH 2/9] removed redundant classes contained now in dhp-schema --- .../dhp/schema/orcid/AuthorHistory.java | 79 ------------- .../dhp/schema/orcid/AuthorSummary.java | 25 ---- .../dnetlib/dhp/schema/orcid/Contributor.java | 58 ---------- .../dnetlib/dhp/schema/orcid/ExternalId.java | 38 ------ .../dnetlib/dhp/schema/orcid/OrcidData.java | 34 ------ .../dhp/schema/orcid/PublicationDate.java | 38 ------ .../eu/dnetlib/dhp/schema/orcid/Work.java | 16 --- .../dnetlib/dhp/schema/orcid/WorkDetail.java | 109 ------------------ 8 files changed, 397 deletions(-) delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorHistory.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorSummary.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Contributor.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/ExternalId.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidData.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/PublicationDate.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Work.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/WorkDetail.java diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorHistory.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorHistory.java deleted file mode 100644 index 554aae82c..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorHistory.java +++ /dev/null @@ -1,79 +0,0 @@ - -package eu.dnetlib.dhp.schema.orcid; - -import java.io.Serializable; - -public class AuthorHistory implements Serializable { - private String creationMethod; - private String completionDate; - private String submissionDate; - private String lastModifiedDate; - private boolean claimed; - private String deactivationDate; - private boolean verifiedEmail; - private boolean verifiedPrimaryEmail; - - public String getCreationMethod() { - return creationMethod; - } - - public void setCreationMethod(String creationMethod) { - this.creationMethod = creationMethod; - } - - public String getCompletionDate() { - return completionDate; - } - - public void setCompletionDate(String completionDate) { - this.completionDate = completionDate; - } - - public String getSubmissionDate() { - return submissionDate; - } - - public void setSubmissionDate(String submissionDate) { - this.submissionDate = submissionDate; - } - - public String getLastModifiedDate() { - return lastModifiedDate; - } - - public void setLastModifiedDate(String lastModifiedDate) { - this.lastModifiedDate = lastModifiedDate; - } - - public boolean isClaimed() { - return claimed; - } - - public void setClaimed(boolean claimed) { - this.claimed = claimed; - } - - public String getDeactivationDate() { - return deactivationDate; - } - - public void setDeactivationDate(String deactivationDate) { - this.deactivationDate = deactivationDate; - } - - public boolean isVerifiedEmail() { - return verifiedEmail; - } - - public void setVerifiedEmail(boolean verifiedEmail) { - this.verifiedEmail = verifiedEmail; - } - - public boolean isVerifiedPrimaryEmail() { - return verifiedPrimaryEmail; - } - - public void setVerifiedPrimaryEmail(boolean verifiedPrimaryEmail) { - this.verifiedPrimaryEmail = verifiedPrimaryEmail; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorSummary.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorSummary.java deleted file mode 100644 index 813aead49..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorSummary.java +++ /dev/null @@ -1,25 +0,0 @@ - -package eu.dnetlib.dhp.schema.orcid; - -import java.io.Serializable; - -public class AuthorSummary extends OrcidData implements Serializable { - AuthorData authorData; - AuthorHistory authorHistory; - - public AuthorData getAuthorData() { - return authorData; - } - - public void setAuthorData(AuthorData authorData) { - this.authorData = authorData; - } - - public AuthorHistory getAuthorHistory() { - return authorHistory; - } - - public void setAuthorHistory(AuthorHistory authorHistory) { - this.authorHistory = authorHistory; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Contributor.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Contributor.java deleted file mode 100644 index 3b543db4b..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Contributor.java +++ /dev/null @@ -1,58 +0,0 @@ - -package eu.dnetlib.dhp.schema.orcid; - -import java.io.Serializable; - -import eu.dnetlib.dhp.schema.orcid.AuthorData; - -/** - * This class models the data related to a contributor, that are retrieved from an orcid publication - */ - -public class Contributor extends AuthorData implements Serializable { - private String sequence; - private String role; - private transient boolean simpleMatch; - private transient Double score; - private transient boolean bestMatch; - - public String getSequence() { - return sequence; - } - - public void setSequence(String sequence) { - this.sequence = sequence; - } - - public String getRole() { - return role; - } - - public void setRole(String role) { - this.role = role; - } - - public boolean isSimpleMatch() { - return simpleMatch; - } - - public void setSimpleMatch(boolean simpleMatch) { - this.simpleMatch = simpleMatch; - } - - public Double getScore() { - return score; - } - - public void setScore(Double score) { - this.score = score; - } - - public boolean isBestMatch() { - return bestMatch; - } - - public void setBestMatch(boolean bestMatch) { - this.bestMatch = bestMatch; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/ExternalId.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/ExternalId.java deleted file mode 100644 index d8f001aa5..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/ExternalId.java +++ /dev/null @@ -1,38 +0,0 @@ - -package eu.dnetlib.dhp.schema.orcid; - -import java.io.Serializable; - -/** - * This class models the data related to external id, that are retrieved from an orcid publication - */ - -public class ExternalId implements Serializable { - private String type; - private String value; - private String relationShip; - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } - - public String getRelationShip() { - return relationShip; - } - - public void setRelationShip(String relationShip) { - this.relationShip = relationShip; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidData.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidData.java deleted file mode 100644 index 606eea6a8..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidData.java +++ /dev/null @@ -1,34 +0,0 @@ - -package eu.dnetlib.dhp.schema.orcid; - -import java.io.Serializable; - -public class OrcidData implements Serializable { - protected String base64CompressData; - protected String statusCode; - protected String downloadDate; - - public String getBase64CompressData() { - return base64CompressData; - } - - public void setBase64CompressData(String base64CompressData) { - this.base64CompressData = base64CompressData; - } - - public String getStatusCode() { - return statusCode; - } - - public void setStatusCode(String statusCode) { - this.statusCode = statusCode; - } - - public String getDownloadDate() { - return downloadDate; - } - - public void setDownloadDate(String downloadDate) { - this.downloadDate = downloadDate; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/PublicationDate.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/PublicationDate.java deleted file mode 100644 index 01972ce95..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/PublicationDate.java +++ /dev/null @@ -1,38 +0,0 @@ - -package eu.dnetlib.dhp.schema.orcid; - -import java.io.Serializable; - -/** - * This class models the data related to a publication date, that are retrieved from an orcid publication - */ - -public class PublicationDate implements Serializable { - private String year; - private String month; - private String day; - - public String getYear() { - return year; - } - - public void setYear(String year) { - this.year = year; - } - - public String getMonth() { - return month; - } - - public void setMonth(String month) { - this.month = month; - } - - public String getDay() { - return day; - } - - public void setDay(String day) { - this.day = day; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Work.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Work.java deleted file mode 100644 index c557eb5d2..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Work.java +++ /dev/null @@ -1,16 +0,0 @@ - -package eu.dnetlib.dhp.schema.orcid; - -import java.io.Serializable; - -public class Work extends OrcidData implements Serializable { - WorkDetail workDetail; - - public WorkDetail getWorkDetail() { - return workDetail; - } - - public void setWorkDetail(WorkDetail workDetail) { - this.workDetail = workDetail; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/WorkDetail.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/WorkDetail.java deleted file mode 100644 index 614d415c1..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/WorkDetail.java +++ /dev/null @@ -1,109 +0,0 @@ - -package eu.dnetlib.dhp.schema.orcid; - -import java.io.Serializable; -import java.util.List; - -import eu.dnetlib.dhp.schema.orcid.Contributor; -import eu.dnetlib.dhp.schema.orcid.ExternalId; -import eu.dnetlib.dhp.schema.orcid.OrcidData; -import eu.dnetlib.dhp.schema.orcid.PublicationDate; - -/** - * This class models the data that are retrieved from orcid publication - */ - -public class WorkDetail implements Serializable { - - private String oid; - private String id; - private String sourceName; - private String type; - private List titles; - private List urls; - List extIds; - List publicationDates; - List contributors; - - public String getOid() { - return oid; - } - - public void setOid(String oid) { - this.oid = oid; - } - - public String getErrorCode() { - return errorCode; - } - - public void setErrorCode(String errorCode) { - this.errorCode = errorCode; - } - - private String errorCode; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List getTitles() { - return titles; - } - - public void setTitles(List titles) { - this.titles = titles; - } - - public String getSourceName() { - return sourceName; - } - - public void setSourceName(String sourceName) { - this.sourceName = sourceName; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public List getUrls() { - return urls; - } - - public void setUrls(List urls) { - this.urls = urls; - } - - public List getExtIds() { - return extIds; - } - - public void setExtIds(List extIds) { - this.extIds = extIds; - } - - public List getPublicationDates() { - return publicationDates; - } - - public void setPublicationDates(List publicationDates) { - this.publicationDates = publicationDates; - } - - public List getContributors() { - return contributors; - } - - public void setContributors(List contributors) { - this.contributors = contributors; - } -} From ae7bd24d79ccd30d978f080da3b623f5090a8d5a Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 20 May 2021 18:32:22 +0200 Subject: [PATCH 3/9] removed old workflows --- .../oozie_app/config-default.xml | 31 - .../orcid_activities/oozie_app/workflow.xml | 542 ------------------ .../oozie_app/config-default.xml | 22 - .../oozie_app/workflow.xml | 505 ---------------- .../oozie_app/workflow.xml | 99 ---- .../oozie_app/workflow.xml | 232 -------- .../oozie_app/config-default.xml | 26 - .../oozie_app/workflow.xml | 40 -- .../oozie_app/config-default.xml | 26 - .../orcid_summaries/oozie_app/workflow.xml | 68 --- 10 files changed, 1591 deletions(-) delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml deleted file mode 100644 index 05fe6d014..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml +++ /dev/null @@ -1,31 +0,0 @@ - - - oozie.action.sharelib.for.java - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx2g - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml deleted file mode 100644 index ea4d33296..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml +++ /dev/null @@ -1,542 +0,0 @@ - - - - workingPath - the working dir base path - - - shell_cmd_0 - wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 0 - - - shell_cmd_1 - wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 1 - - - shell_cmd_2 - wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 2 - - - shell_cmd_3 - wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 3 - - - shell_cmd_4 - wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 4 - - - shell_cmd_5 - wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 5 - - - shell_cmd_6 - wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 6 - - - shell_cmd_7 - wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 7 - - - shell_cmd_8 - wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 8 - - - shell_cmd_9 - wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 9 - - - shell_cmd_X - wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file X - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_0.tar.gz - -owno_doi_works/works_0.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_1} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_1.tar.gz - -owno_doi_works/works_1.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_2} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_2.tar.gz - -owno_doi_works/works_2.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_3} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_3.tar.gz - -owno_doi_works/works_3.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_4} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_4.tar.gz - -owno_doi_works/works_4.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_5} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_5.tar.gz - -owno_doi_works/works_5.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_6} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_6.tar.gz - -owno_doi_works/works_6.seq - -oewno_doi_enriched_works/ - - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_7} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_7.tar.gz - -owno_doi_works/works_7.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_8} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_8.tar.gz - -owno_doi_works/works_8.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_9} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_9.tar.gz - -owno_doi_works/works_9.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_X} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_X.tar.gz - -owno_doi_works/works_X.seq - -oewno_doi_enriched_works/ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml deleted file mode 100644 index 5621415d9..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/config-default.xml +++ /dev/null @@ -1,22 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.action.sharelib.for.java - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx4g - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml deleted file mode 100644 index 1c2ae89dd..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_authors_dois_data/oozie_app/workflow.xml +++ /dev/null @@ -1,505 +0,0 @@ - - - - workingPath_activities - the working dir base path - - - shell_cmd_0 - wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 0 - - - shell_cmd_1 - wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 1 - - - shell_cmd_2 - wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 2 - - - shell_cmd_3 - wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 3 - - - shell_cmd_4 - wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 4 - - - shell_cmd_5 - wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 5 - - - shell_cmd_6 - wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 6 - - - shell_cmd_7 - wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 7 - - - shell_cmd_8 - wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 8 - - - shell_cmd_9 - wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 9 - - - shell_cmd_X - wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file X - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_0.tar.gz - -ooutput/authors_dois_0.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_1} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_1.tar.gz - -ooutput/authors_dois_1.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_2} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_2.tar.gz - -ooutput/authors_dois_2.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_3} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_3.tar.gz - -ooutput/authors_dois_3.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_4} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_4.tar.gz - -ooutput/authors_dois_4.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_5} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_5.tar.gz - -ooutput/authors_dois_5.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_6} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_6.tar.gz - -ooutput/authors_dois_6.seq - - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_7} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_7.tar.gz - -ooutput/authors_dois_7.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_8} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_8.tar.gz - -ooutput/authors_dois_8.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_9} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_9.tar.gz - -ooutput/authors_dois_9.seq - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_X} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidAuthorsDOIsDataGen - -d${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_X.tar.gz - -ooutput/authors_dois_X.seq - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml deleted file mode 100644 index 133a6f4bd..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml +++ /dev/null @@ -1,99 +0,0 @@ - - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - spark2MaxExecutors - 20 - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - workingPath - the working dir base path - - - - - ${jobTracker} - ${nameNode} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - yarn-cluster - cluster - GenDoiAuthorList - eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList - dhp-doiboost-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.dynamicAllocation.enabled=true - --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - - -w${workingPath}/ - -aauthors/authors.seq - -xwxml/works/*.seq - -odoi_author_list/ - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml deleted file mode 100644 index 6f629c754..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_activities/oozie_app/workflow.xml +++ /dev/null @@ -1,232 +0,0 @@ - - - - workingPath - the working dir base path - - - - - ${jobTracker} - ${nameNode} - - - oozie.action.sharelib.for.java - ${oozieActionShareLibForSpark2} - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx2g - - - oozie.use.system.libpath - true - - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_0.tar.gz - -owxml/works/xml_works_0.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_1.tar.gz - -owxml/works/xml_works_1.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_2.tar.gz - -owxml/works/xml_works_2.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_3.tar.gz - -owxml/works/xml_works_3.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_4.tar.gz - -owxml/works/xml_works_4.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_5.tar.gz - -owxml/works/xml_works_5.seq - -oew--- - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_6.tar.gz - -owxml/works/xml_works_6.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_7.tar.gz - -owxml/works/xml_works_7.seq - -oew--- - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_8.tar.gz - -owxml/works/xml_works_8.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_9.tar.gz - -owxml/works/xml_works_9.seq - -oew--- - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_activites_X.tar.gz - -owxml/works/xml_works_X.seq - -oew--- - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml deleted file mode 100644 index 191654378..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/config-default.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx8g - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml deleted file mode 100644 index 68d468ab3..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_extract_xml_summaries/oozie_app/workflow.xml +++ /dev/null @@ -1,40 +0,0 @@ - - - - workingPath - the working dir base path - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_summaries.tar.gz - -oxml/authors/ - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml deleted file mode 100644 index 191654378..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.launcher.mapreduce.map.java.opts - -Xmx8g - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml deleted file mode 100644 index 8517f35ee..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - - workingPath - the working dir base path - - - shell_cmd_0 - wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz - - the shell command that downloads and puts to hdfs orcid summaries - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcid.OrcidDSManager - -w${workingPath}/ - -n${nameNode} - -fORCID_2020_10_summaries.tar.gz - -oauthors/ - - - - - - - \ No newline at end of file From 1265dadc90686ad9a687a8bf7e20fd739d663ae9 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 20 May 2021 19:01:28 +0200 Subject: [PATCH 4/9] workflow aligned with stable_ids --- .../orcid/SparkDownloadOrcidAuthors.java | 2 +- .../orcidnodoi/oaf/PublicationToOaf.java | 4 +- .../oozie_app/workflow.xml | 42 ------------------- .../orcidnodoi/oozie_app/workflow.xml | 31 ++++++++++---- 4 files changed, 27 insertions(+), 52 deletions(-) delete mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_orcid_no_doi/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java index 36b4b073d..8cf070213 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java @@ -62,7 +62,7 @@ public class SparkDownloadOrcidAuthors { isSparkSessionManaged, spark -> { String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt"); - logger.info("lastUpdate: ", lastUpdate); + logger.info("lastUpdate: {}", lastUpdate); if (StringUtils.isBlank(lastUpdate)) { throw new RuntimeException("last update info not found"); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 5c3236222..ccae4d976 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -174,7 +174,9 @@ public class PublicationToOaf implements Serializable { publication .getExternalReference() .add( - convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types")); + convertExtRef( + extId, classid, classname, ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES)); } }); diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_orcid_no_doi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_orcid_no_doi/oozie_app/workflow.xml deleted file mode 100644 index becdf0974..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_orcid_no_doi/oozie_app/workflow.xml +++ /dev/null @@ -1,42 +0,0 @@ - - - - inputPath - /data/orcid_activities_2020/no_doi_dataset - path where retrieve the already generated action set - - - outputPath - /data/orcid_activities_2020/test_import_orcid_no_doi - path where to store the action set - - - - - ${jobTracker} - ${nameNode} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - ${inputPath}/* - ${outputPath} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index 6513ff7e1..365c4d5b4 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -1,5 +1,15 @@ + + workingPath + /data/orcid_activities_2020 + path where the collection workflow stores the ORCID data + + + outputPath + path where to store the action set + + spark2GenNoDoiDatasetMaxExecutors 40 @@ -35,10 +45,6 @@ spark2EventLogDir spark 2.* event log dir location - - workingPath - the working dir base path - @@ -83,11 +89,20 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - -w${workingPath}/ - -n${nameNode} - -ilast_orcid_dataset - -oewno_doi_dataset + --workingPath${workingPath}/ + --hdfsServerUri${nameNode} + --orcidDataFolderlast_orcid_dataset + --outputEnrichedWorksPathno_doi_dataset + + + + + + + ${workingPath}/no_doi_dataset/* + ${outputPath} + From d0945c3c7817d7081b6fb214b6ec7ee227e3a396 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 20 May 2021 19:14:31 +0200 Subject: [PATCH 5/9] added temporary output folder, because of folder access rights are different on beta and prod --- .../dhp/doiboost/orcidnodoi/oozie_app/workflow.xml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index 365c4d5b4..68f370a20 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -9,7 +9,10 @@ outputPath path where to store the action set - + + processOutputFolder + temporary path where to store the action set + spark2GenNoDoiDatasetMaxExecutors 40 @@ -66,7 +69,7 @@ - + @@ -92,7 +95,7 @@ --workingPath${workingPath}/ --hdfsServerUri${nameNode} --orcidDataFolderlast_orcid_dataset - --outputEnrichedWorksPathno_doi_dataset + --outputEnrichedWorksPath${processOutputFolder} @@ -100,7 +103,7 @@ - ${workingPath}/no_doi_dataset/* + ${workingPath}/${processOutputFolder}/* ${outputPath} From abdd0ade1fd154cbacedfea928788f600b3ba4a7 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Fri, 21 May 2021 12:08:16 +0200 Subject: [PATCH 6/9] added temporary output folder as workflow parameter --- .../eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index 68f370a20..f7ac04821 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -7,10 +7,12 @@ outputPath + /data/orcid_activities_2020/no_doi_dataset_prod/ path where to store the action set processOutputFolder + process_no_doi_dataset_prod temporary path where to store the action set From a65667d2175d2b6a83a04462699d74b7aee13305 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 14 Jul 2021 15:07:07 +0200 Subject: [PATCH 7/9] added publication to dataset even if no contributors --- .../SparkGenEnrichedOrcidWorks.java | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 5bcec7224..ca39f99cc 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -4,6 +4,7 @@ package eu.dnetlib.doiboost.orcidnodoi; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; +import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -32,10 +33,7 @@ import com.google.gson.JsonParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.orcid.AuthorData; -import eu.dnetlib.dhp.schema.orcid.AuthorSummary; -import eu.dnetlib.dhp.schema.orcid.Work; -import eu.dnetlib.dhp.schema.orcid.WorkDetail; +import eu.dnetlib.dhp.schema.orcid.*; import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.util.HDFSUtil; import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; @@ -111,6 +109,10 @@ public class SparkGenEnrichedOrcidWorks { Encoders.bean(WorkDetail.class)); logger.info("Works data loaded: " + workDataset.count()); + final LongAccumulator warnNotFoundContributors = spark + .sparkContext() + .longAccumulator("warnNotFoundContributors"); + JavaRDD> enrichedWorksRDD = workDataset .joinWith( authorDataset, @@ -119,7 +121,21 @@ public class SparkGenEnrichedOrcidWorks { (MapFunction, Tuple2>) value -> { WorkDetail w = value._1; AuthorData a = value._2; - AuthorMatcher.match(a, w.getContributors()); + if (w.getContributors() == null + || (w.getContributors() != null && w.getContributors().size() == 0)) { + Contributor c = new Contributor(); + c.setName(a.getName()); + c.setSurname(a.getSurname()); + c.setCreditName(a.getCreditName()); + c.setOid(a.getOid()); + List contributors = Arrays.asList(c); + w.setContributors(contributors); + if (warnNotFoundContributors != null) { + warnNotFoundContributors.add(1); + } + } else { + AuthorMatcher.match(a, w.getContributors()); + } return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w)); }, Encoders.tuple(Encoders.STRING(), Encoders.STRING())) @@ -180,6 +196,7 @@ public class SparkGenEnrichedOrcidWorks { logger.info("parsedPublications: " + parsedPublications.value().toString()); logger.info("enrichedPublications: " + enrichedPublications.value().toString()); + logger.info("warnNotFoundContributors: " + warnNotFoundContributors.value().toString()); logger.info("errorsGeneric: " + errorsGeneric.value().toString()); logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString()); logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString()); From 66604bb2b4fe045858b1d54ac3fc2df93bbbdeba Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 14 Jul 2021 16:44:51 +0200 Subject: [PATCH 8/9] added absolute path to process folder --- .../doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java | 2 +- .../dhp/doiboost/orcidnodoi/oozie_app/workflow.xml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index c342d2e79..1d47808ef 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -188,7 +188,7 @@ public class SparkGenEnrichedOrcidWorks { OBJECT_MAPPER.writeValueAsString(new AtomicAction<>(Publication.class, p)))) .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) .saveAsNewAPIHadoopFile( - workingPath.concat(outputEnrichedWorksPath), + outputEnrichedWorksPath, Text.class, Text.class, SequenceFileOutputFormat.class, diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index f7ac04821..05492d50d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -11,7 +11,7 @@ path where to store the action set - processOutputFolder + processOutputPath process_no_doi_dataset_prod temporary path where to store the action set @@ -71,7 +71,7 @@ - + @@ -97,7 +97,7 @@ --workingPath${workingPath}/ --hdfsServerUri${nameNode} --orcidDataFolderlast_orcid_dataset - --outputEnrichedWorksPath${processOutputFolder} + --outputEnrichedWorksPath${processOutputPath} @@ -105,7 +105,7 @@ - ${workingPath}/${processOutputFolder}/* + ${processOutputPath}/* ${outputPath} From 2dc50c0999738eb3051fb8570f3e3a0b4bb81c7e Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 14 Jul 2021 17:02:22 +0200 Subject: [PATCH 9/9] added default value to process path --- .../eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index 05492d50d..04ca05af2 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -12,7 +12,7 @@ processOutputPath - process_no_doi_dataset_prod + /data/orcid_activities_2020/process_no_doi_dataset_prod temporary path where to store the action set