[ORCID-no-doi] integrating PR#98 D-Net/dnet-hadoop#98

This commit is contained in:
Claudio Atzori 2021-04-01 17:11:03 +02:00
parent ee34cc51c3
commit e686b8de8d
3 changed files with 0 additions and 138 deletions

View File

@ -1,31 +0,0 @@
diff a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java (rejected hunks)
@@ -1,8 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-
import static com.google.common.base.Preconditions.checkArgument;
import java.text.ParseException;
@@ -10,6 +8,8 @@ import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+
/**
* Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to
* graph node identifiers and it is further characterised by the semantic of the link through the fields relType,
@@ -137,7 +137,10 @@ public class Relation extends Oaf {
try {
setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
} catch (ParseException e) {
- throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate()));
+ throw new IllegalArgumentException(String
+ .format(
+ "invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(),
+ getValidationDate()));
}
super.mergeFrom(r);

View File

@ -1,30 +0,0 @@
diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java (rejected hunks)
@@ -31,7 +32,6 @@ public class SparkDownloadOrcidAuthors {
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
- static String lastUpdate;
public static void main(String[] args) throws Exception {
@@ -54,14 +54,18 @@ public class SparkDownloadOrcidAuthors {
final String token = parser.get("token");
final String lambdaFileName = parser.get("lambdaFileName");
logger.info("lambdaFileName: {}", lambdaFileName);
-
- lastUpdate = HDFSUtil.readFromTextFile(workingPath.concat("last_update.txt"));
+ final String hdfsServerUri = parser.get("hdfsServerUri");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
+ String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
+ logger.info("lastUpdate: {}", lastUpdate);
+ if (StringUtils.isBlank(lastUpdate)) {
+ throw new RuntimeException("last update info not found");
+ }
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");

View File

@ -1,77 +0,0 @@
diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java (rejected hunks)
@@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
- public static final String ORCID = "ORCID";
- public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID";
public final static String orcidPREFIX = "orcid_______";
public static final String OPENAIRE_PREFIX = "openaire____";
public static final String SEPARATOR = "::";
+ public static final String DEACTIVATED_NAME = "Given Names Deactivated";
+ public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
private String dateOfCollection = "";
private final LongAccumulator parsedPublications;
@@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable {
this.errorsNotFoundAuthors = null;
this.errorsInvalidType = null;
this.otherTypeFound = null;
+ this.deactivatedAcc = null;
+ this.titleNotProvidedAcc = null;
+ this.noUrlAcc = null;
this.dateOfCollection = null;
}
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
{
- put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
+ put(
+ ModelConstants.ORCID,
+ new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
}
};
@@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable {
}
return null;
}
+ if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
+ if (titleNotProvidedAcc != null) {
+ titleNotProvidedAcc.add(1);
+ }
+ return null;
+ }
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
publication
.setTitle(
@@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable {
private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue();
- cf.setValue(ORCID);
+ cf.setValue(ModelConstants.ORCID.toUpperCase());
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
return cf;
}
private KeyValue createHostedBy() {
- KeyValue hb = new KeyValue();
- hb.setValue("Unknown Repository");
- hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
- return hb;
+ return ModelConstants.UNKNOWN_REPOSITORY;
}
private StructuredProperty mapAuthorId(String orcidId) {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(orcidId);
final Qualifier q = new Qualifier();
- q.setClassid(ORCID.toLowerCase());
- q.setClassname(ORCID_PID_TYPE_CLASSNAME);
+ q.setClassid(ModelConstants.ORCID);
+ q.setClassname(ModelConstants.ORCID_CLASSNAME);
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
q.setSchemename(ModelConstants.DNET_PID_TYPES);
sp.setQualifier(q);