From e686b8de8deca03b2645cacdcb3e8724c136ba8c Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 1 Apr 2021 17:11:03 +0200
Subject: [PATCH] [ORCID-no-doi] integrating PR#98
 https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/98

---
 .../dnetlib/dhp/schema/oaf/Relation.java.rej  | 31 --------
 .../orcid/SparkDownloadOrcidAuthors.java.rej  | 30 --------
 .../orcidnodoi/oaf/PublicationToOaf.java.rej  | 77 -------------------
 3 files changed, 138 deletions(-)
 delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java.rej
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java.rej
 delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java.rej

diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java.rej b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java.rej
deleted file mode 100644
index 7ce658877..000000000
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java.rej
+++ /dev/null
@@ -1,31 +0,0 @@
-diff a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java (rejected hunks)
-@@ -1,8 +1,6 @@
- 
- package eu.dnetlib.dhp.schema.oaf;
- 
--import eu.dnetlib.dhp.schema.common.ModelSupport;
--
- import static com.google.common.base.Preconditions.checkArgument;
- 
- import java.text.ParseException;
-@@ -10,6 +8,8 @@ import java.util.*;
- import java.util.stream.Collectors;
- import java.util.stream.Stream;
- 
-+import eu.dnetlib.dhp.schema.common.ModelSupport;
-+
- /**
-  * Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to
-  * graph node identifiers and it is further characterised by the semantic of the link through the fields relType,
-@@ -137,7 +137,10 @@ public class Relation extends Oaf {
- 		try {
- 			setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
- 		} catch (ParseException e) {
--			throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate()));
-+			throw new IllegalArgumentException(String
-+				.format(
-+					"invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(),
-+					getValidationDate()));
- 		}
- 
- 		super.mergeFrom(r);
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java.rej b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java.rej
deleted file mode 100644
index fc22d8a7a..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java.rej
+++ /dev/null
@@ -1,30 +0,0 @@
-diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java (rejected hunks)
-@@ -31,7 +32,6 @@ public class SparkDownloadOrcidAuthors {
- 
- 	static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
- 	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
--	static String lastUpdate;
- 
- 	public static void main(String[] args) throws Exception {
- 
-@@ -54,14 +54,18 @@ public class SparkDownloadOrcidAuthors {
- 		final String token = parser.get("token");
- 		final String lambdaFileName = parser.get("lambdaFileName");
- 		logger.info("lambdaFileName: {}", lambdaFileName);
--
--		lastUpdate = HDFSUtil.readFromTextFile(workingPath.concat("last_update.txt"));
-+		final String hdfsServerUri = parser.get("hdfsServerUri");
- 
- 		SparkConf conf = new SparkConf();
- 		runWithSparkSession(
- 			conf,
- 			isSparkSessionManaged,
- 			spark -> {
-+				String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
-+				logger.info("lastUpdate: ", lastUpdate);
-+				if (StringUtils.isBlank(lastUpdate)) {
-+					throw new RuntimeException("last update info not found");
-+				}
- 				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
- 
- 				LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java.rej b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java.rej
deleted file mode 100644
index 76b63a93d..000000000
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java.rej
+++ /dev/null
@@ -1,77 +0,0 @@
-diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java (rejected hunks)
-@@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable {
- 
- 	static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
- 
--	public static final String ORCID = "ORCID";
--	public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID";
- 	public final static String orcidPREFIX = "orcid_______";
- 	public static final String OPENAIRE_PREFIX = "openaire____";
- 	public static final String SEPARATOR = "::";
-+	public static final String DEACTIVATED_NAME = "Given Names Deactivated";
-+	public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
- 
- 	private String dateOfCollection = "";
- 	private final LongAccumulator parsedPublications;
-@@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable {
- 		this.errorsNotFoundAuthors = null;
- 		this.errorsInvalidType = null;
- 		this.otherTypeFound = null;
-+		this.deactivatedAcc = null;
-+		this.titleNotProvidedAcc = null;
-+		this.noUrlAcc = null;
- 		this.dateOfCollection = null;
- 	}
- 
- 	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
- 
- 		{
--			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
-+			put(
-+				ModelConstants.ORCID,
-+				new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
- 
- 		}
- 	};
-@@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable {
- 			}
- 			return null;
- 		}
-+		if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
-+			if (titleNotProvidedAcc != null) {
-+				titleNotProvidedAcc.add(1);
-+			}
-+			return null;
-+		}
- 		Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
- 		publication
- 			.setTitle(
-@@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable {
- 
- 	private KeyValue createCollectedFrom() {
- 		KeyValue cf = new KeyValue();
--		cf.setValue(ORCID);
-+		cf.setValue(ModelConstants.ORCID.toUpperCase());
- 		cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
- 		return cf;
- 	}
- 
- 	private KeyValue createHostedBy() {
--		KeyValue hb = new KeyValue();
--		hb.setValue("Unknown Repository");
--		hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
--		return hb;
-+		return ModelConstants.UNKNOWN_REPOSITORY;
- 	}
- 
- 	private StructuredProperty mapAuthorId(String orcidId) {
- 		final StructuredProperty sp = new StructuredProperty();
- 		sp.setValue(orcidId);
- 		final Qualifier q = new Qualifier();
--		q.setClassid(ORCID.toLowerCase());
--		q.setClassname(ORCID_PID_TYPE_CLASSNAME);
-+		q.setClassid(ModelConstants.ORCID);
-+		q.setClassname(ModelConstants.ORCID_CLASSNAME);
- 		q.setSchemeid(ModelConstants.DNET_PID_TYPES);
- 		q.setSchemename(ModelConstants.DNET_PID_TYPES);
- 		sp.setQualifier(q);