public static CleaningRuleMap create(VocabularyGroup vocabularies) {
CleaningRuleMap mapping = new CleaningRuleMap();
mapping.put(Qualifier.class, o -> cleanQualifier(vocabularies, (Qualifier) o));
+ mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
mapping.put(Country.class, o -> {
final Country c = (Country) o;
if (StringUtils.isBlank(c.getSchemeid())) {
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java
index e53f4ca30..68623dd55 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java
@@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*;
+import java.util.stream.Collectors;
import javax.xml.crypto.Data;
@@ -127,6 +128,13 @@ public class MergeGraphTableSparkJob {
}
}, Encoders.bean(p_clazz))
.filter((FilterFunction) Objects::nonNull)
+ .filter((FilterFunction
) o -> {
+ HashSet collectedFromNames = Optional
+ .ofNullable(o.getCollectedfrom())
+ .map(c -> c.stream().map(KeyValue::getValue).collect(Collectors.toCollection(HashSet::new)))
+ .orElse(new HashSet());
+ return !collectedFromNames.contains("Datacite");
+ })
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
index cccf15398..3d75c426d 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@@ -1,36 +1,11 @@
package eu.dnetlib.dhp.oa.graph.raw;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.keyValue;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.oaiIProvenance;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Optional;
+import java.util.*;
+import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
@@ -38,27 +13,12 @@ import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
+import com.google.common.collect.Lists;
+
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.Author;
-import eu.dnetlib.dhp.schema.oaf.Context;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Dataset;
-import eu.dnetlib.dhp.schema.oaf.Field;
-import eu.dnetlib.dhp.schema.oaf.GeoLocation;
-import eu.dnetlib.dhp.schema.oaf.Instance;
-import eu.dnetlib.dhp.schema.oaf.Journal;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
-import eu.dnetlib.dhp.schema.oaf.Publication;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.dhp.schema.oaf.Result;
-import eu.dnetlib.dhp.schema.oaf.Software;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public abstract class AbstractMdRecordToOafMapper {
@@ -66,6 +26,8 @@ public abstract class AbstractMdRecordToOafMapper {
private final boolean invisible;
+ private final boolean shouldHashId;
+
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
@@ -89,12 +51,11 @@ public abstract class AbstractMdRecordToOafMapper {
nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3);
}
- protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
- "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
-
- protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
+ protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, // signature widened: subclasses must now pass a third flag
+ final boolean shouldHashId) { // later handed to IdentifierFactory.createIdentifier(entity, shouldHashId) when entity ids are computed
this.vocs = vocs;
this.invisible = invisible;
+ this.shouldHashId = shouldHashId; // stored as given; no validation or defaulting happens here
}
public List processMdRecord(final String xml) {
@@ -137,10 +98,10 @@ public abstract class AbstractMdRecordToOafMapper {
}
protected String getResultType(final Document doc, final List instances) {
- final String type = doc.valueOf("//dr:CobjCategory/@type");
+ String type = doc.valueOf("//dr:CobjCategory/@type");
if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
- final String instanceType = instances
+ String instanceType = instances
.stream()
.map(i -> i.getInstancetype().getClassid())
.findFirst()
@@ -178,20 +139,34 @@ public abstract class AbstractMdRecordToOafMapper {
final DataInfo info,
final long lastUpdateTimestamp) {
- final List oafs = new ArrayList<>();
+ final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
+ final String id = IdentifierFactory.createIdentifier(entity, shouldHashId);
+ if (!id.equals(entity.getId())) {
+ entity.getOriginalId().add(entity.getId());
+ entity.setId(id);
+ }
+ final List oafs = Lists.newArrayList(entity);
+
+ if (!oafs.isEmpty()) {
+ oafs.addAll(addProjectRels(doc, entity));
+ oafs.addAll(addOtherResultRels(doc, entity));
+ }
+
+ return oafs;
+ }
+
+ private OafEntity createEntity(Document doc, String type, List instances, KeyValue collectedFrom,
+ DataInfo info, long lastUpdateTimestamp) {
switch (type.toLowerCase()) {
case "publication":
final Publication p = new Publication();
populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
- p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
p.setJournal(prepareJournal(doc, info));
- oafs.add(p);
- break;
+ return p;
case "dataset":
final Dataset d = new Dataset();
populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
- d.setResulttype(DATASET_DEFAULT_RESULTTYPE);
d.setStoragedate(prepareDatasetStorageDate(doc, info));
d.setDevice(prepareDatasetDevice(doc, info));
d.setSize(prepareDatasetSize(doc, info));
@@ -199,48 +174,34 @@ public abstract class AbstractMdRecordToOafMapper {
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
- oafs.add(d);
- break;
+ return d;
case "software":
final Software s = new Software();
populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
- s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
s.setLicense(prepareSoftwareLicenses(doc, info));
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
- oafs.add(s);
- break;
+ return s;
case "":
case "otherresearchproducts":
default:
final OtherResearchProduct o = new OtherResearchProduct();
populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
- o.setResulttype(ORP_DEFAULT_RESULTTYPE);
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
o.setTool(prepareOtherResearchProductTools(doc, info));
- oafs.add(o);
- break;
+ return o;
}
-
- if (!oafs.isEmpty()) {
- oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
- oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
- }
-
- return oafs;
}
private List addProjectRels(
final Document doc,
- final KeyValue collectedFrom,
- final DataInfo info,
- final long lastUpdateTimestamp) {
+ final OafEntity entity) {
final List res = new ArrayList<>();
- final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
+ final String docId = entity.getId();
for (final Object o : doc.selectNodes("//oaf:projectid")) {
@@ -253,72 +214,51 @@ public abstract class AbstractMdRecordToOafMapper {
res
.add(
- getRelationWithValidationDate(
- docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info,
- lastUpdateTimestamp, validationdDate));
+ getRelation(
+ docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate));
res
.add(
- getRelationWithValidationDate(
- projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info,
- lastUpdateTimestamp, validationdDate));
+ getRelation(
+ projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate));
}
}
return res;
}
- protected Relation getRelationWithValidationDate(final String source,
- final String target,
- final String relType,
- final String subRelType,
- final String relClass,
- final KeyValue collectedFrom,
- final DataInfo info,
- final long lastUpdateTimestamp,
- final String validationDate) {
-
- final Relation r = getRelation(
- source, target, relType, subRelType, relClass, collectedFrom, info, lastUpdateTimestamp);
- r.setValidated(StringUtils.isNotBlank(validationDate));
- r.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
-
- if (StringUtils.isNotBlank(validationDate)) {
- r.setValidated(true);
- r.setValidationDate(validationDate);
- r.getDataInfo().setTrust(DEFAULT_TRUST_FOR_VALIDATED_RELS);
- } else {
- r.setValidated(false);
- r.setValidationDate(null);
- }
-
- return r;
- }
-
protected Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
- final KeyValue collectedFrom,
- final DataInfo info,
- final long lastUpdateTimestamp) {
+ final OafEntity entity) { // provenance is now taken from the entity instead of three separate arguments
+ return getRelation(source, target, relType, subRelType, relClass, entity, null); // null validation date: the 7-arg overload then sets validated=false and validationDate=null
+ }
+
+ protected Relation getRelation(final String source, // overload that also records whether/when the relation was validated
+ final String target,
+ final String relType,
+ final String subRelType,
+ final String relClass,
+ final OafEntity entity, // supplies collectedfrom, dataInfo and lastupdatetimestamp for the new relation
+ final String validationDate) { // null or blank means "not validated"
final Relation rel = new Relation();
rel.setRelType(relType);
rel.setSubRelType(subRelType);
rel.setRelClass(relClass);
rel.setSource(source);
rel.setTarget(target);
- rel.setCollectedfrom(Arrays.asList(collectedFrom));
- rel.setDataInfo(info);
- rel.setLastupdatetimestamp(lastUpdateTimestamp);
+ rel.setCollectedfrom(entity.getCollectedfrom()); // getter result is passed through without copying
+ rel.setDataInfo(entity.getDataInfo()); // NOTE(review): no copy is made, so the relation and the entity reference the same DataInfo object
+ rel.setLastupdatetimestamp(entity.getLastupdatetimestamp());
+ rel.setValidated(StringUtils.isNotBlank(validationDate)); // true only when a non-blank date was supplied
+ rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null); // NOTE(review): the removed getRelationWithValidationDate also set dataInfo trust to DEFAULT_TRUST_FOR_VALIDATED_RELS; that step is not reproduced here - confirm this is intended
return rel;
}
protected abstract List addOtherResultRels(
final Document doc,
- final KeyValue collectedFrom,
- final DataInfo info,
- final long lastUpdateTimestamp);
+ final OafEntity entity); // replaces the three provenance arguments; OafToOafMapper in this patch reads the source id via entity.getId()
private void populateResultFields(
final Result r,
@@ -330,11 +270,11 @@ public abstract class AbstractMdRecordToOafMapper {
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
- r.setOriginalId(Arrays.asList(findOriginalId(doc)));
+ r.setOriginalId(Lists.newArrayList(findOriginalId(doc)));
r.setCollectedfrom(Arrays.asList(collectedFrom));
- r.setPid(prepareResultPids(doc, info));
- r.setDateofcollection(doc.valueOf("//dr:dateOfCollection|//dri:dateOfCollection"));
- r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation|//dri:dateOfTransformation"));
+ r.setPid(IdentifierFactory.getPids(prepareResultPids(doc, info), collectedFrom));
+ r.setDateofcollection(doc.valueOf("//dr:dateOfCollection/text()|//dri:dateOfCollection/text()"));
+ r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation/text()|//dri:dateOfTransformation/text()"));
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setOaiprovenance(prepareOAIprovenance(doc));
r.setAuthor(prepareAuthors(doc, info));
@@ -357,7 +297,7 @@ public abstract class AbstractMdRecordToOafMapper {
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setInstance(instances);
- r.setBestaccessright(getBestAccessRights(instances));
+ r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
}
protected abstract List prepareResultPids(Document doc, DataInfo info);
@@ -442,38 +382,6 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info);
- public static Qualifier createBestAccessRights(final List instanceList) {
- return getBestAccessRights(instanceList);
- }
-
- protected static Qualifier getBestAccessRights(final List instanceList) {
- if (instanceList != null) {
- final Optional min = instanceList
- .stream()
- .map(i -> i.getAccessright())
- .min(new LicenseComparator());
-
- final Qualifier rights = min.isPresent() ? min.get() : new Qualifier();
-
- if (StringUtils.isBlank(rights.getClassid())) {
- rights.setClassid(UNKNOWN);
- }
- if (StringUtils.isBlank(rights.getClassname())
- || UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
- rights.setClassname(NOT_AVAILABLE);
- }
- if (StringUtils.isBlank(rights.getSchemeid())) {
- rights.setSchemeid(DNET_ACCESS_MODES);
- }
- if (StringUtils.isBlank(rights.getSchemename())) {
- rights.setSchemename(DNET_ACCESS_MODES);
- }
-
- return rights;
- }
- return null;
- }
-
private Journal prepareJournal(final Document doc, final DataInfo info) {
final Node n = doc.selectSingleNode("//oaf:journal");
if (n != null) {
@@ -505,6 +413,20 @@ public abstract class AbstractMdRecordToOafMapper {
}
+ protected AccessRight prepareAccessRight(final Node node, final String xpath, final String schemeId) { // builds an AccessRight from the trimmed text selected by xpath on node
+ Qualifier qualifier = prepareQualifier(node.valueOf(xpath).trim(), schemeId); // same call that prepareQualifier(Node, String, String) makes
+ AccessRight accessRight = new AccessRight();
+ accessRight.setClassid(qualifier.getClassid()); // the four qualifier fields are copied one by one onto the new object
+ accessRight.setClassname(qualifier.getClassname());
+ accessRight.setSchemeid(qualifier.getSchemeid());
+ accessRight.setSchemename(qualifier.getSchemename());
+
+ // TODO set the OAStatus
+ // accessRight.setOaStatus(...);
+
+ return accessRight; // the OA status is not populated by this method (see TODO above)
+ }
+
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) {
return prepareQualifier(node.valueOf(xpath).trim(), schemeId);
}
@@ -636,4 +558,5 @@ public abstract class AbstractMdRecordToOafMapper {
}
return res;
}
+
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
index cfd190670..40020427a 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
@@ -64,13 +64,19 @@ public class GenerateEntitiesApplication {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
+ final boolean shouldHashId = Optional
+ .ofNullable(parser.get("shouldHashId"))
+ .map(Boolean::valueOf)
+ .orElse(true);
+ log.info("shouldHashId: {}", shouldHashId);
+
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration());
- generateEntities(spark, vocs, sourcePaths, targetPath);
+ generateEntities(spark, vocs, sourcePaths, targetPath, shouldHashId);
});
}
@@ -78,7 +84,8 @@ public class GenerateEntitiesApplication {
final SparkSession spark,
final VocabularyGroup vocs,
final String sourcePaths,
- final String targetPath) {
+ final String targetPath,
+ final boolean shouldHashId) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final List existingSourcePaths = Arrays
@@ -97,15 +104,12 @@ public class GenerateEntitiesApplication {
sc
.sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
- .map(k -> convertToListOaf(k._1(), k._2(), vocs))
+ .map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs))
.filter(Objects::nonNull)
.flatMap(list -> list.iterator()));
}
inputRdd
- .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
- .reduceByKey((o1, o2) -> OafMapperUtils.merge(o1, o2))
- .map(Tuple2::_2)
.map(
oaf -> oaf.getClass().getSimpleName().toLowerCase()
+ "|"
@@ -116,20 +120,21 @@ public class GenerateEntitiesApplication {
private static List convertToListOaf(
final String id,
final String s,
+ final boolean shouldHashId,
final VocabularyGroup vocs) {
final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) {
case "oaf-store-cleaned":
case "oaf-store-claim":
- return new OafToOafMapper(vocs, false).processMdRecord(s);
+ return new OafToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "odf-store-cleaned":
case "odf-store-claim":
- return new OdfToOafMapper(vocs, false).processMdRecord(s);
+ return new OdfToOafMapper(vocs, false, shouldHashId).processMdRecord(s);
case "oaf-store-intersection":
- return new OafToOafMapper(vocs, true).processMdRecord(s);
+ return new OafToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
case "odf-store-intersection":
- return new OdfToOafMapper(vocs, true).processMdRecord(s);
+ return new OdfToOafMapper(vocs, true, shouldHashId).processMdRecord(s);
case "datasource":
return Arrays.asList(convertFromJson(s, Datasource.class));
case "organization":
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
index 4d7de6f7f..1776689bd 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@@ -23,7 +23,15 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import java.io.Closeable;
import java.io.IOException;
@@ -491,44 +499,48 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return Arrays.asList(r);
} else {
+ final String validationDate = rs.getString("curation_date");
+
final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false);
final Relation r1 = new Relation();
final Relation r2 = new Relation();
- if (rs.getString(SOURCE_TYPE).equals("project")) {
- r1.setCollectedfrom(collectedFrom);
- r1.setRelType(RESULT_PROJECT);
- r1.setSubRelType(OUTCOME);
- r1.setRelClass(PRODUCES);
-
- r2.setCollectedfrom(collectedFrom);
- r2.setRelType(RESULT_PROJECT);
- r2.setSubRelType(OUTCOME);
- r2.setRelClass(IS_PRODUCED_BY);
- } else {
- r1.setCollectedfrom(collectedFrom);
- r1.setRelType(RESULT_RESULT);
- r1.setSubRelType(RELATIONSHIP);
- r1.setRelClass(IS_RELATED_TO);
-
- r2.setCollectedfrom(collectedFrom);
- r2.setRelType(RESULT_RESULT);
- r2.setSubRelType(RELATIONSHIP);
- r2.setRelClass(IS_RELATED_TO);
- }
-
+ r1.setValidated(true);
+ r1.setValidationDate(validationDate);
+ r1.setCollectedfrom(collectedFrom);
r1.setSource(sourceId);
r1.setTarget(targetId);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
+ r2.setValidationDate(validationDate);
+ r2.setValidated(true);
+ r2.setCollectedfrom(collectedFrom);
r2.setSource(targetId);
r2.setTarget(sourceId);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
+ if (rs.getString(SOURCE_TYPE).equals("project")) {
+ r1.setRelType(RESULT_PROJECT);
+ r1.setSubRelType(OUTCOME);
+ r1.setRelClass(PRODUCES);
+
+ r2.setRelType(RESULT_PROJECT);
+ r2.setSubRelType(OUTCOME);
+ r2.setRelClass(IS_PRODUCED_BY);
+ } else {
+ r1.setRelType(RESULT_RESULT);
+ r1.setSubRelType(RELATIONSHIP);
+ r1.setRelClass(IS_RELATED_TO);
+
+ r2.setRelType(RESULT_RESULT);
+ r2.setSubRelType(RELATIONSHIP);
+ r2.setRelClass(IS_RELATED_TO);
+ }
+
return Arrays.asList(r1, r2);
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
index e62bc0790..22bd6718a 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
@@ -2,12 +2,12 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
@@ -19,20 +19,14 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.Author;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Field;
-import eu.dnetlib.dhp.schema.oaf.GeoLocation;
-import eu.dnetlib.dhp.schema.oaf.Instance;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
- public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
- super(vocs, invisible);
+ public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { // extra flag mirrors the widened base-class constructor
+ super(vocs, invisible, shouldHashId); // all three arguments are passed straight through to AbstractMdRecordToOafMapper
}
@Override
@@ -93,7 +87,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override
protected List> prepareDescriptions(final Document doc, final DataInfo info) {
- return prepareListFields(doc, "//dc:description", info);
+ return prepareListFields(doc, "//dc:description", info) // fields built from the //dc:description xpath
+ .stream()
+ .map(d -> {
+ d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH)); // truncates the value in place to at most MAX_ABSTRACT_LENGTH characters
+ return d;
+ })
+ .collect(Collectors.toList());
}
@Override
@@ -128,10 +128,21 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby);
+
+ final List alternateIdentifier = prepareResultPids(doc, info);
+ final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
+
+ final Set pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
+
+ instance
+ .setAlternateIdentifier(
+ alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList()));
+ instance.setPid(pid);
+
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance
- .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
+ .setAccessright(prepareAccessRight(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
instance
@@ -257,11 +268,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override
protected List addOtherResultRels(
final Document doc,
- final KeyValue collectedFrom,
- final DataInfo info,
- final long lastUpdateTimestamp) {
- final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
+ final OafEntity entity) {
+ final String docId = entity.getId();
final List res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
@@ -275,13 +284,11 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
res
.add(
getRelation(
- docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
- lastUpdateTimestamp));
+ docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, entity));
res
.add(
getRelation(
- otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
- lastUpdateTimestamp));
+ otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, entity));
}
}
return res;
@@ -295,6 +302,10 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override
protected List prepareResultPids(final Document doc, final DataInfo info) {
return prepareListStructPropsWithValidQualifier(
- doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info);
+ doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info) // pids read from //oaf:identifier, qualified by @identifierType against DNET_PID_TYPES
+ .stream()
+ .map(CleaningFunctions::normalizePidValue) // each pid is passed through CleaningFunctions.normalizePidValue before being returned
+ .collect(Collectors.toList());
}
+
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
index 6d2e28ba8..d4997cd2b 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@@ -2,40 +2,27 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.Node;
-import com.google.common.collect.Lists;
-
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.Author;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Field;
-import eu.dnetlib.dhp.schema.oaf.GeoLocation;
-import eu.dnetlib.dhp.schema.oaf.Instance;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
- public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
- super(vocs, invisible);
+ public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { // extra flag mirrors the widened base-class constructor
+ super(vocs, invisible, shouldHashId); // all three arguments are passed straight through to AbstractMdRecordToOafMapper
}
@Override
@@ -51,31 +38,34 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final Node n = (Node) o;
final Author author = new Author();
final String fullname = n.valueOf("./datacite:creatorName");
- author.setFullname(fullname);
-
- final PacePerson pp = new PacePerson(fullname, false);
final String name = n.valueOf("./datacite:givenName");
- if (StringUtils.isBlank(name) & pp.isAccurate()) {
- author.setName(pp.getNormalisedFirstName());
- } else {
- author.setName(name);
- }
-
final String surname = n.valueOf("./datacite:familyName");
- if (StringUtils.isBlank(surname) & pp.isAccurate()) {
- author.setSurname(pp.getNormalisedSurname());
- } else {
- author.setSurname(surname);
- }
+ if (StringUtils.isNotBlank(fullname) || StringUtils.isNotBlank(name) || StringUtils.isNotBlank(surname)) {
+ author.setFullname(fullname);
- if (StringUtils.isBlank(author.getFullname())) {
- author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
- }
+ final PacePerson pp = new PacePerson(fullname, false);
- author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info));
- author.setPid(preparePids(n, info));
- author.setRank(pos++);
- res.add(author);
+ if (StringUtils.isBlank(name) & pp.isAccurate()) {
+ author.setName(pp.getNormalisedFirstName());
+ } else {
+ author.setName(name);
+ }
+
+ if (StringUtils.isBlank(surname) & pp.isAccurate()) {
+ author.setSurname(pp.getNormalisedSurname());
+ } else {
+ author.setSurname(surname);
+ }
+
+ if (StringUtils.isBlank(author.getFullname())) {
+ author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
+ }
+
+ author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info));
+ author.setPid(preparePids(n, info));
+ author.setRank(pos++);
+ res.add(author);
+ }
}
return res;
}
@@ -114,10 +104,21 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby);
+
+ final List alternateIdentifier = prepareResultPids(doc, info);
+ final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
+
+ final Set pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
+
+ instance
+ .setAlternateIdentifier(
+ alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList()));
+ instance.setPid(pid);
+
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance
- .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
+ .setAccessright(prepareAccessRight(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
@@ -169,13 +170,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
res
.add(
structuredProperty(
- ((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE,
+ ((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATACITE_DATE, DNET_DATACITE_DATE,
info));
} else {
res
.add(
structuredProperty(
- ((Node) o).getText(), dateType, dateType, DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE,
+ ((Node) o).getText(), dateType, dateType, DNET_DATACITE_DATE, DNET_DATACITE_DATE,
info));
}
}
@@ -320,11 +321,9 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override
protected List addOtherResultRels(
final Document doc,
- final KeyValue collectedFrom,
- final DataInfo info,
- final long lastUpdateTimestamp) {
+ final OafEntity entity) {
- final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
+ final String docId = entity.getId();
final List res = new ArrayList<>();
@@ -336,30 +335,26 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final String otherId = createOpenaireId(50, originalId, false);
final String type = ((Node) o).valueOf("@relationType");
- if (type.equalsIgnoreCase("IsSupplementTo")) {
+ if (type.equalsIgnoreCase(IS_SUPPLEMENT_TO)) {
res
.add(
getRelation(
- docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info,
- lastUpdateTimestamp));
+ docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity));
res
.add(
getRelation(
- otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info,
- lastUpdateTimestamp));
- } else if (type.equalsIgnoreCase("IsPartOf")) {
-
+ otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity));
+ } else if (type.equalsIgnoreCase(IS_PART_OF)) {
res
.add(
getRelation(
- docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info,
- lastUpdateTimestamp));
+ docId, otherId, RESULT_RESULT, PART, IS_PART_OF, entity));
res
.add(
getRelation(
- otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info,
- lastUpdateTimestamp));
+ otherId, docId, RESULT_RESULT, PART, HAS_PARTS, entity));
} else {
+ // TODO catch more semantics
}
}
}
@@ -390,7 +385,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
doc,
"//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
"@alternateIdentifierType", DNET_PID_TYPES, info));
- return Lists.newArrayList(res);
+
+ return res
+ .stream()
+ .map(CleaningFunctions::normalizePidValue)
+ .collect(Collectors.toList());
}
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java
index f7579c0a0..a0ce4f5a6 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java
@@ -11,7 +11,8 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
-import org.codehaus.jackson.map.ObjectMapper;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Oaf;
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
index 992d8c40e..dc0529012 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@@ -50,36 +50,12 @@