From 5b724d9972007696c5dacb90833e0397dec0f59e Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 4 Jun 2021 10:14:22 +0200 Subject: [PATCH] added relations to datacite mapping --- .../DataciteToOAFTransformation.scala | 81 +++++++++++++++++-- .../GenerateDataciteDatasetSpark.scala | 3 +- .../datacite/DataciteToOAFTest.scala | 17 +++- .../sx/graphimport/SparkDataciteToOAF.scala | 31 +++++++ 4 files changed, 122 insertions(+), 10 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala index 979ab43711..aab2488a33 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -3,10 +3,9 @@ package eu.dnetlib.dhp.actionmanager.datacite import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.action.AtomicAction +import eu.dnetlib.dhp.schema.common.{ModelConstants, ModelSupport} import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} -import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} +import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils @@ -25,6 +24,8 @@ import scala.io.{Codec, Source} case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {} +case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {} + case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {} case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {} @@ -43,6 +44,36 @@ case class HostedByMapType(openaire_id: String, datacite_name: String, official_ object DataciteToOAFTransformation { +val REL_TYPE_VALUE:String = "resultResult" + + val subRelTypeMapping: Map[String,String] = Map( + "References" ->"relationship", + "IsSupplementTo" ->"supplement", + "IsPartOf" ->"part", + "HasPart" ->"part", + "IsVersionOf" ->"version", + "HasVersion" ->"version", + "IsIdenticalTo" ->"relationship", + "IsPreviousVersionOf" ->"version", + "IsContinuedBy" ->"relationship", + "Continues" ->"relationship", + "IsNewVersionOf" ->"version", + "IsSupplementedBy" ->"supplement", + "IsDocumentedBy" ->"relationship", + "IsSourceOf" ->"relationship", + "Cites" ->"citation", + "IsCitedBy" ->"citation", + "IsDerivedFrom" ->"relationship", + "IsVariantFormOf" ->"version", + "IsReferencedBy" ->"relationship", + "IsObsoletedBy" ->"version", + "Reviews" ->"review", + "Documents" ->"relationship", + "IsCompiledBy" ->"relationship", + "Compiles" ->"relationship", + "IsReviewedBy" ->"review" + ) + implicit val codec: Codec = Codec("UTF-8") codec.onMalformedInput(CodingErrorAction.REPLACE) codec.onUnmappableCharacter(CodingErrorAction.REPLACE) @@ -232,6 +263,7 @@ object DataciteToOAFTransformation { * As describe in ticket #6377 * when the result come from figshare we need to remove subject * and set Access rights OPEN. + * * @param r */ def fix_figshare(r: Result): Unit = { @@ -248,6 +280,12 @@ object DataciteToOAFTransformation { } + + def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = { + val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_') + s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}" + } + def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = { OafMapperUtils.structuredProperty(dt, q, null) } @@ -286,7 +324,7 @@ object DataciteToOAFTransformation { } - def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup): List[Oaf] = { + def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = { if (filter_json(input)) return List() @@ -468,11 +506,44 @@ object DataciteToOAFTransformation { JField("awardUri", JString(awardUri)) <- fundingReferences } yield awardUri - val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) + var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) + + fix_figshare(result) result.setId(IdentifierFactory.createIdentifier(result)) if (result.getId == null) return List() + + if (exportLinks) { + val rels: List[RelatedIdentifierType] = for { + JObject(relIdentifier) <- json \\ "relatedIdentifiers" + JField("relationType", JString(relationType)) <- relIdentifier + JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier + JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier + } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType) + + + relations = relations ::: rels + .filter(r => + subRelTypeMapping.contains(r.relationType) && ( + r.relatedIdentifierType.equalsIgnoreCase("doi") || + r.relatedIdentifierType.equalsIgnoreCase("pmid") || + r.relatedIdentifierType.equalsIgnoreCase("arxiv") ) + ) + .map(r => { + val rel = new Relation + + val subRelType = subRelTypeMapping.get(r.relationType) + rel.setRelType(REL_TYPE_VALUE) + rel.setSubRelType(subRelType.get) + rel.setRelClass(r.relationType) + rel.setSource(result.getId) + rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + rel.setDataInfo(dataInfo) + rel.setTarget(createDNetTargetIdentifier(r.relatedIdentifier, r.relatedIdentifierType, "50|")) + rel + }) + } if (relations != null && relations.nonEmpty) { List(result) ::: relations } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala index 44b175cb22..2cabc78799 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala @@ -22,6 +22,7 @@ object GenerateDataciteDatasetSpark { val master = parser.get("master") val sourcePath = parser.get("sourcePath") val targetPath = parser.get("targetPath") + val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks")) val isLookupUrl: String = parser.get("isLookupUrl") log.info("isLookupUrl: {}", isLookupUrl) @@ -40,7 +41,7 @@ object GenerateDataciteDatasetSpark { spark.read.load(sourcePath).as[DataciteType] .filter(d => d.isActive) - .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies)) + .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)) .filter(d => d != null) .write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala index d8c9d56fcf..0d10c41dca 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala @@ -1,12 +1,15 @@ package eu.dnetlib.dhp.actionmanager.datacite +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.databind.SerializationFeature + import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.schema.oaf.Oaf import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.{BeforeEach, Test} import org.mockito.junit.jupiter.MockitoExtension -import org.codehaus.jackson.map.ObjectMapper + import scala.io.Source @ExtendWith(Array(classOf[MockitoExtension])) @@ -25,9 +28,15 @@ class DataciteToOAFTest extends AbstractVocabularyTest{ - val mapper = new ObjectMapper() - val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies ) - println (mapper.defaultPrettyPrintingWriter().writeValueAsString(res.head)) + val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) + val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true ) + + res.foreach(r => { + println (mapper.writeValueAsString(r)) + println("----------------------------") + + }) + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala new file mode 100644 index 0000000000..9e905d806b --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala @@ -0,0 +1,31 @@ +package eu.dnetlib.dhp.oa.sx.graphimport + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession + +object SparkDataciteToOAF { + + + def main(args: Array[String]): Unit = { + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + import spark.implicits._ + + + val sc = spark.sparkContext + + val inputPath = parser.get("inputPath") + + + } + +}