From 616d2ecce2b104a9a3a9fe7d9ed813fbfb40d6db Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 31 Mar 2021 15:45:58 +0200 Subject: [PATCH] splitted workflow collecting datacite into two workflows. Released on beta --- .../dhp/schema/common/ModelConstants.java | 7 ++ .../DataciteToOAFTransformation.scala | 64 ++++++++-------- .../GenerateDataciteDatasetSpark.scala | 1 - .../datacite/ImportDatacite.scala | 2 +- .../actionset/oozie_app/config-default.xml | 23 ++++++ .../datacite/actionset/oozie_app/workflow.xml | 46 +++++++++++ .../datacite/generate_dataset_params.json | 6 -- .../datacite/oozie_app/workflow.xml | 76 +++---------------- .../dhp/transformation/oozie_app/workflow.xml | 2 +- .../datacite/DataciteToOAFTest.scala | 35 +++++++++ .../dhp/actionmanager/datacite/record.json | 1 + .../mag/SparkImportMagIntoDataset.scala | 5 +- 12 files changed, 160 insertions(+), 108 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index eaa8acef5..2ed672c12 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -89,6 +89,13 @@ public class ModelConstants { public static final String UNKNOWN = "UNKNOWN"; public static final String NOT_AVAILABLE = "not available"; + public static final String ACTION_SET_SCHEME = "sysimport:actionset"; + + public static final String PROVENANCE_VOCABULARY = "dnet:provenanceActions"; + + public static final Qualifier ACTION_SET_PROVENANCE_QUALIFIER = qualifier( + ACTION_SET_SCHEME, ACTION_SET_SCHEME, PROVENANCE_VOCABULARY, PROVENANCE_VOCABULARY); + public static final Qualifier PUBLICATION_DEFAULT_RESULTTYPE = qualifier( PUBLICATION_RESULTTYPE_CLASSID, PUBLICATION_RESULTTYPE_CLASSID, DNET_RESULT_TYPOLOGIES, DNET_RESULT_TYPOLOGIES); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala index 1776a4ad6..1f1abba60 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -3,7 +3,9 @@ package eu.dnetlib.dhp.actionmanager.datacite import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory +import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils import org.json4s.DefaultFormats @@ -45,17 +47,11 @@ object DataciteToOAFTransformation { codec.onMalformedInput(CodingErrorAction.REPLACE) codec.onUnmappableCharacter(CodingErrorAction.REPLACE) - - - private val PID_VOCABULARY = "dnet:pid_types" - val COBJ_VOCABULARY = "dnet:publication_resource" - val RESULT_VOCABULARY = "dnet:result_typologies" - val ACCESS_MODE_VOCABULARY = "dnet:access_modes" val DOI_CLASS = "doi" - val TITLE_SCHEME = "dnet:dataCite_title" + val SUBJ_CLASS = "keywords" - val SUBJ_SCHEME = "dnet:subject_classification_typologies" + val j_filter:List[String] = { val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString @@ -66,7 +62,7 @@ object DataciteToOAFTransformation { val unknown_repository: HostedByMapType = HostedByMapType("openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18", "Unknown Repository", "Unknown Repository", Some(1.0F)) val dataInfo: DataInfo = generateDataInfo("0.9") - val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite") + val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite") val hostedByMap: Map[String, HostedByMapType] = { val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString @@ -174,20 +170,20 @@ object DataciteToOAFTransformation { def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): (Qualifier, Qualifier) = { if (resourceType != null && resourceType.nonEmpty) { - val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceType) + val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) if (typeQualifier != null) - return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) } if (schemaOrg != null && schemaOrg.nonEmpty) { - val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, schemaOrg) + val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg) if (typeQualifier != null) - return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) } if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) { - val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceTypeGeneral) + val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral) if (typeQualifier != null) - return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) } null @@ -295,7 +291,7 @@ object DataciteToOAFTransformation { return List() - val doi_q = vocabularies.getSynonymAsQualifier(PID_VOCABULARY, "doi") + val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES,ModelConstants.DNET_PID_TYPES) val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) result.setPid(List(pid).asJava) result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) @@ -319,7 +315,7 @@ object DataciteToOAFTransformation { a.setSurname(c.familyName.orNull) if (c.nameIdentifiers!= null&& c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) { a.setPid(c.nameIdentifiers.get.map(ni => { - val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(PID_VOCABULARY, ni.nameIdentifierScheme.get.toLowerCase()) else null + val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null if (ni.nameIdentifier!= null && ni.nameIdentifier.isDefined) { OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo) } @@ -343,9 +339,9 @@ object DataciteToOAFTransformation { result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => { if (t.titleType.isEmpty) { - OafMapperUtils.structuredProperty(t.title.get, "main title", "main title", TITLE_SCHEME, TITLE_SCHEME, null) + OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null) } else { - OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, TITLE_SCHEME, TITLE_SCHEME, null) + OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null) } }).asJava) @@ -390,7 +386,7 @@ object DataciteToOAFTransformation { result.setSubject(subjects.filter(s => s.subject.nonEmpty) .map(s => - OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, SUBJ_SCHEME, SUBJ_SCHEME, null) + OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null) ).asJava) @@ -426,28 +422,33 @@ object DataciteToOAFTransformation { JField("rightsUri", JString(rightsUri)) <- rightsList } yield rightsUri - val aRights: Option[Qualifier] = accessRights.map(r => { - vocabularies.getSynonymAsQualifier(ACCESS_MODE_VOCABULARY, r) - }).find(q => q != null) + val aRights: Option[AccessRight] = accessRights.map(r => { + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r) + }).find(q => q != null).map(q => { + val a = new AccessRight + a.setClassid(q.getClassid) + a.setClassname(q.getClassname) + a.setSchemeid(q.getSchemeid) + a.setSchemename(q.getSchemename) + a + }) - val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier("UNKNOWN", "not available", ACCESS_MODE_VOCABULARY, ACCESS_MODE_VOCABULARY) + val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) if (client.isDefined) { val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository) instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name)) instance.setCollectedfrom(DATACITE_COLLECTED_FROM) instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) -// instance.setAccessright(access_rights_qualifier) - - //'http') and matches(., '.*(/licenses|/publicdomain|unlicense.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*')]"> + instance.setAccessright(access_rights_qualifier) + instance.setPid(result.getPid) val license = accessRights .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*")) if (license.isDefined) instance.setLicense(OafMapperUtils.field(license.get, null)) } - val awardUris:List[String] = for { JObject(fundingReferences) <- json \\ "fundingReferences" JField("awardUri", JString(awardUri)) <- fundingReferences @@ -455,6 +456,9 @@ object DataciteToOAFTransformation { val relations:List[Relation] =awardUris.flatMap(a=> get_projectRelation(a, result.getId)).filter(r => r!= null) + result.setId(IdentifierFactory.createIdentifier(result)) + if(result.getId == null) + return List() if (relations!= null && relations.nonEmpty) { List(result):::relations } @@ -468,7 +472,7 @@ object DataciteToOAFTransformation { di.setInferred(false) di.setInvisible(false) di.setTrust(trust) - di.setProvenanceaction(OafMapperUtils.qualifier("sysimport:actionset", "sysimport:actionset", "dnet:provenanceActions", "dnet:provenanceActions")) + di.setProvenanceaction(ModelConstants.ACTION_SET_PROVENANCE_QUALIFIER) di } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala index 168ad218a..44b175cb2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala @@ -27,7 +27,6 @@ object GenerateDataciteDatasetSpark { val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) - log.info(s"vocabulary size is ${vocabularies.getTerms("dnet:languages").size()}") val spark: SparkSession = SparkSession.builder().config(conf) .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName) .master(master) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala index 6cec4ea34..8e9e8728e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala @@ -148,7 +148,7 @@ object ImportDatacite { try { var start: Long = System.currentTimeMillis while (from < now) { - client = new DataciteAPIImporter(from, 1000, from + delta) + client = new DataciteAPIImporter(from, 100, from + delta) var end: Long = 0 val key: IntWritable = new IntWritable(i) val value: Text = new Text diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml new file mode 100644 index 000000000..dd3c32c62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml @@ -0,0 +1,23 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml new file mode 100644 index 000000000..3c58ace7b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml @@ -0,0 +1,46 @@ + + + + sourcePath + the working path of Datacite stores + + + outputPath + the path of Datacite ActionSet + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + yarn-cluster + cluster + ExportDataset + eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --targetPath${outputPath} + --masteryarn-cluster + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json index 34fa3ed99..dea037fd4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json @@ -12,12 +12,6 @@ "paramDescription": "the target mdstore path", "paramRequired": true }, - { - "paramName": "tr", - "paramLongName": "transformationRule", - "paramDescription": "the transformation Rule", - "paramRequired": true - }, { "paramName": "m", "paramLongName": "master", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml index 15378c6c7..3eee2e5a8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml @@ -1,41 +1,21 @@ - mdstoreInputPath - the path of the input MDStore - - - - mdstoreOutputPath - the path of the cleaned mdstore + mainPath + the working path of Datacite stores - nativeInputPath - the path of the input MDStore + isLookupUrl + The IS lookUp service endopoint - - skipimport - false - the path of the input MDStore - - - - - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - ${wf:conf('resumeFrom') eq 'TransformJob'} - ${wf:conf('resumeFrom') eq 'ExportDataset'} - - - @@ -53,10 +33,9 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - -t${nativeInputPath} - -d${mdstoreInputPath} - -n${nameNode} - -s${skipimport} + --targetPath${mainPath}/datacite_update + --dataciteDumpPath${mainPath}/datacite_dump + --namenode${nameNode} --masteryarn-cluster @@ -81,44 +60,9 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --sourcePath${mdstoreInputPath} - --targetPath${mdstoreOutputPath} + --sourcePath${mainPath}/datacite_dump + --targetPath${mainPath}/datacite_oaf --isLookupUrl${isLookupUrl} - -tr${isLookupUrl} - --masteryarn-cluster - - - - - - - - - - - - - - - - - yarn-cluster - cluster - ExportDataset - eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode - dhp-aggregation-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePath${mdstoreOutputPath} - --targetPath${mdstoreOutputPath}_raw_AS --masteryarn-cluster diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index 61e5710fa..3d7b1bf22 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + mdStoreInputId diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala new file mode 100644 index 000000000..a7d404300 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala @@ -0,0 +1,35 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest +import eu.dnetlib.dhp.schema.oaf.Oaf +import org.junit.jupiter.api.extension.ExtendWith +import org.junit.jupiter.api.{BeforeEach, Test} +import org.mockito.junit.jupiter.MockitoExtension + +import scala.io.Source + +@ExtendWith(Array(classOf[MockitoExtension])) +class DataciteToOAFTest extends AbstractVocabularyTest{ + + + @BeforeEach + def setUp() :Unit = { + println("Called Method") + super.setUpVocabulary() + } + + @Test + def testMapping() :Unit = { + val record =Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString + + + + val mapper = new ObjectMapper() + val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies ) + println (mapper.writeValueAsString(res.head)) + + + } + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json new file mode 100644 index 000000000..b5ac40e25 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json @@ -0,0 +1 @@ +{"relationships": {"client": {"data": {"type": "clients", "id": "gbif.gbif"}}}, "attributes": {"partCount": 0, "contributors": [], "versionCount": 0, "titles": [{"title": "Occurrence Download"}], "descriptions": [{"lang": "eng", "descriptionType": "Abstract", "description": "A dataset containing 2452 species occurrences available in GBIF matching the query: { \"TaxonKey\" : [ \"is Indotyphlops braminus (Daudin, 1803)\" ] } The dataset includes 2452 records from 48 constituent datasets: 8 records from India Biodiversity Portal publication grade dataset. 6 records from UCM Amphibian and Reptile Collection (Arctos). 577 records from iNaturalist Research-grade Observations. 5 records from NCSM Herpetology Collection. 4 records from Vertebrate Zoology Division - Herpetology, Yale Peabody Museum. 4 records from Asian Herpetological Collection - IICT. 11 records from WildNet - Queensland Wildlife Data. 68 records from Australian Museum provider for OZCAM. 33 records from Geographically tagged INSDC sequences. 8 records from LACM Vertebrate Collection. 19 records from Herpetology. 88 records from KUBI Herpetology Collection. 187 records from The reptiles and amphibians collection (RA) of the Mus\u00e9um national d'Histoire Naturelle (MNHN - Paris). 36 records from Queensland Museum provider for OZCAM. 46 records from DAYO: Invasive Alien Reptiles in the Philippines. 33 records from MVZ Herp Collection (Arctos). 22 records from BYU Herpetology Collection. 1 records from Atlas des amphibiens et reptiles de Martinique. 2 records from University of Alberta Museum of Zoology Amphibian and Reptile Collection (UAMZ). 2 records from NMNH Paleobiology Specimen Records. 23 records from South Australian Museum Australia provider for OZCAM. 30 records from Fauna Atlas N.T.. 302 records from Museum of Comparative Zoology, Harvard University. 9 records from Herpetology Collection NRM. 397 records from University of Florida Herpetology. 40 records from Western Australian Museum provider for OZCAM. 1 records from Collection Herpetology SMF. 16 records from TNHC Herpetology Collection. 28 records from International Barcode of Life project (iBOL). 26 records from Base de donn\u00e9es de NOI \u2013 JDD_HISTORIQUE. 24 records from Naturalis Biodiversity Center (NL) - Amphibia and Reptilia. 242 records from Field Museum of Natural History (Zoology) Amphibian and Reptile Collection. 18 records from Australian National Wildlife Collection provider for OZCAM. 1 records from Herpetology. 1 records from Donn\u00e9es naturalistes d'Olivier ESCUDER. 12 records from Reptile Specimens. 1 records from Lund Museum of Zoology (MZLU). 2 records from Donn\u00e9es d'occurrences Esp\u00e8ces issues de l'inventaire des ZNIEFF. 6 records from Museums Victoria provider for OZCAM. 3 records from Questagame weekly feed. 78 records from Northern Territory Museum and Art Gallery provider for OZCAM. 1 records from SysTax - Zoological Collections. 11 records from UMZC Zoological Specimens. 1 records from AUMNH Herpetology Voucher Collection. 1 records from Tissues Specimens. 10 records from Apoyo a las colecciones biol\u00f3gicas de la Facultad de Ciencias de la UNAM: Fase 1 (MZFC_HE). 4 records from HerpMapper. 4 records from ALA species sightings and OzAtlas. Data from some individual datasets included in this download may be licensed under less restrictive terms."}], "referenceCount": 0, "subjects": [{"lang": "eng", "subject": "GBIF"}, {"lang": "eng", "subject": "biodiversity"}, {"lang": "eng", "subject": "species occurrences"}], "container": {}, "state": "findable", "created": "2020-08-23T05:03:37.000Z", "source": null, "metadataVersion": 0, "version": null, "isActive": true, "registered": "2020-08-23T05:03:37.000Z", "contentUrl": null, "geoLocations": [], "updated": "2020-08-23T05:03:37.000Z", "fundingReferences": [], "partOfCount": 0, "viewCount": 0, "versionOfCount": 0, "published": "2020", "dates": [{"date": "2020-08-23", "dateType": "Created"}, {"date": "2020-08-23", "dateType": "Updated"}, {"date": "2020", "dateType": "Issued"}], "relatedIdentifiers": [{"relationType": "References", "relatedIdentifier": "10.15468/rs5upd", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/1llmgl", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ab3s5x", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/enivwl", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ypdvp9", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/fhn7xo", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lxgoyb", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/e7susi", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/cndomv", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/77rmwd", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/2wlj2m", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ubdwdc", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/whdzq3", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lotsye", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/cpv8vf", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/pi1mts", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/tekwqq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/4eswn6", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.18165/qmltit", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/7m0fvd", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/wz4rrh", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/eeg0zb", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/p5rupv", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/b9o7h4", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/vw3dvj", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/5qt0dm", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lkc3vq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/xrorih", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/inygc6", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lhcdhw", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ythnjq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/u2pzhj", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/gscnac", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/px1sya", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/g5giua", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/cplkwg", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/mw39rb", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ikshke", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lp1ctu", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/slqqt8", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/giro3a", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/zyqkbl", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/pjmjvn", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/d1vglq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/5fmfwq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/htjhrb", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/9tsf2l", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/jayxmn", "relatedIdentifierType": "DOI"}], "reason": null, "rightsList": [{"rightsUri": "https://creativecommons.org/licenses/by-nc/4.0/legalcode", "rightsIdentifier": "cc-by-nc-4.0", "rightsIdentifierScheme": "SPDX", "schemeUri": "https://spdx.org/licenses/", "rights": "Creative Commons Attribution Non Commercial 4.0 International"}], "schemaVersion": "http://datacite.org/schema/kernel-4", "types": {"citeproc": "dataset", "resourceTypeGeneral": "Dataset", "schemaOrg": "Dataset", "bibtex": "misc", "ris": "DATA"}, "publisher": "The Global Biodiversity Information Facility", "publicationYear": 2020, "doi": "10.15468/dl.g9k8b9", "language": null, "sizes": ["161836"], "url": "https://www.gbif.org/occurrence/download/0044053-200613084148143", "identifiers": [], "citationCount": 0, "formats": ["Darwin Core Archive"], "downloadCount": 0, "creators": [{"nameType": "Organizational", "nameIdentifiers": [], "name": "Occdownload Gbif.Org", "affiliation": []}]}, "type": "dois", "id": "10.15468/dl.g9k8b9", "timestamp": 1598151817} \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala index 88fee72b7..76d29206a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala @@ -6,7 +6,6 @@ import org.apache.spark.SparkConf import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.types._ import org.slf4j.{Logger, LoggerFactory} -import org.apache.spark.sql.functions._ object SparkImportMagIntoDataset { val datatypedict = Map( @@ -25,11 +24,10 @@ object SparkImportMagIntoDataset { "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")), "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), - "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")), + "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")), "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")), "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")), - // ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime'] "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")), "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")), @@ -37,6 +35,7 @@ object SparkImportMagIntoDataset { "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")), "PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")), "PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")), + "PaperMeSH" -> Tuple2("advanced/PaperMeSH.txt", Seq("PaperId:long", "DescriptorUI:string", "DescriptorName:string", "QualifierUI:string", "QualifierName:string", "IsMajorTopic:bool")), "PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")), "PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")), "PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),