diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index fb02a5e00..364c36ee2 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -100,6 +100,13 @@ public class ModelConstants { public static final String UNKNOWN = "UNKNOWN"; public static final String NOT_AVAILABLE = "not available"; + public static final String ACTION_SET_SCHEME = "sysimport:actionset"; + + public static final String PROVENANCE_VOCABULARY = "dnet:provenanceActions"; + + public static final Qualifier ACTION_SET_PROVENANCE_QUALIFIER = qualifier( + ACTION_SET_SCHEME, ACTION_SET_SCHEME, PROVENANCE_VOCABULARY, PROVENANCE_VOCABULARY); + public static final Qualifier PUBLICATION_DEFAULT_RESULTTYPE = qualifier( PUBLICATION_RESULTTYPE_CLASSID, PUBLICATION_RESULTTYPE_CLASSID, DNET_RESULT_TYPOLOGIES, DNET_RESULT_TYPOLOGIES); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala index 8c97dfd03..871d5a5c3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -5,6 +5,9 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory +import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils import org.json4s.DefaultFormats @@ -49,9 +52,9 @@ object DataciteToOAFTransformation { val ACCESS_MODE_VOCABULARY = "dnet:access_modes" val DOI_CLASS = "doi" - val TITLE_SCHEME = "dnet:dataCite_title" + val SUBJ_CLASS = "keywords" - val SUBJ_SCHEME = "dnet:subject_classification_typologies" + val j_filter:List[String] = { val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString @@ -62,7 +65,7 @@ object DataciteToOAFTransformation { val unknown_repository: HostedByMapType = HostedByMapType("openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18", "Unknown Repository", "Unknown Repository", Some(1.0F)) val dataInfo: DataInfo = generateDataInfo("0.9") - val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite") + val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite") val hostedByMap: Map[String, HostedByMapType] = { val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString @@ -291,7 +294,7 @@ object DataciteToOAFTransformation { return List() - val doi_q = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PID_TYPES, "doi") + val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES,ModelConstants.DNET_PID_TYPES) val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) result.setPid(List(pid).asJava) result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) @@ -339,9 +342,9 @@ object DataciteToOAFTransformation { result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => { if (t.titleType.isEmpty) { - OafMapperUtils.structuredProperty(t.title.get, "main title", "main title", TITLE_SCHEME, TITLE_SCHEME, null) + OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null) } else { - OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, TITLE_SCHEME, TITLE_SCHEME, null) + OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null) } }).asJava) @@ -386,7 +389,7 @@ object DataciteToOAFTransformation { result.setSubject(subjects.filter(s => s.subject.nonEmpty) .map(s => - OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, SUBJ_SCHEME, SUBJ_SCHEME, null) + OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null) ).asJava) @@ -422,28 +425,33 @@ object DataciteToOAFTransformation { JField("rightsUri", JString(rightsUri)) <- rightsList } yield rightsUri - val aRights: Option[Qualifier] = accessRights.map(r => { + val aRights: Option[AccessRight] = accessRights.map(r => { vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r) - }).find(q => q != null) + }).find(q => q != null).map(q => { + val a = new AccessRight + a.setClassid(q.getClassid) + a.setClassname(q.getClassname) + a.setSchemeid(q.getSchemeid) + a.setSchemename(q.getSchemename) + a + }) - val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) if (client.isDefined) { val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository) instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name)) instance.setCollectedfrom(DATACITE_COLLECTED_FROM) instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) -// instance.setAccessright(access_rights_qualifier) - - //'http') and matches(., '.*(/licenses|/publicdomain|unlicense.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*')]"> + instance.setAccessright(access_rights_qualifier) + instance.setPid(result.getPid) val license = accessRights .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*")) if (license.isDefined) instance.setLicense(OafMapperUtils.field(license.get, null)) } - val awardUris:List[String] = for { JObject(fundingReferences) <- json \\ "fundingReferences" JField("awardUri", JString(awardUri)) <- fundingReferences @@ -451,6 +459,9 @@ object DataciteToOAFTransformation { val relations:List[Relation] =awardUris.flatMap(a=> get_projectRelation(a, result.getId)).filter(r => r!= null) + result.setId(IdentifierFactory.createIdentifier(result)) + if(result.getId == null) + return List() if (relations!= null && relations.nonEmpty) { List(result):::relations } @@ -464,7 +475,7 @@ object DataciteToOAFTransformation { di.setInferred(false) di.setInvisible(false) di.setTrust(trust) - di.setProvenanceaction(OafMapperUtils.qualifier("sysimport:actionset", "sysimport:actionset", "dnet:provenanceActions", "dnet:provenanceActions")) + di.setProvenanceaction(ModelConstants.ACTION_SET_PROVENANCE_QUALIFIER) di } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala index 168ad218a..44b175cb2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala @@ -27,7 +27,6 @@ object GenerateDataciteDatasetSpark { val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) - log.info(s"vocabulary size is ${vocabularies.getTerms("dnet:languages").size()}") val spark: SparkSession = SparkSession.builder().config(conf) .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName) .master(master) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala index 6cec4ea34..8e9e8728e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala @@ -148,7 +148,7 @@ object ImportDatacite { try { var start: Long = System.currentTimeMillis while (from < now) { - client = new DataciteAPIImporter(from, 1000, from + delta) + client = new DataciteAPIImporter(from, 100, from + delta) var end: Long = 0 val key: IntWritable = new IntWritable(i) val value: Text = new Text diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml new file mode 100644 index 000000000..dd3c32c62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml @@ -0,0 +1,23 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml new file mode 100644 index 000000000..3c58ace7b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml @@ -0,0 +1,46 @@ + + + + sourcePath + the working path of Datacite stores + + + outputPath + the path of Datacite ActionSet + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + yarn-cluster + cluster + ExportDataset + eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --targetPath${outputPath} + --masteryarn-cluster + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json index 34fa3ed99..dea037fd4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json @@ -12,12 +12,6 @@ "paramDescription": "the target mdstore path", "paramRequired": true }, - { - "paramName": "tr", - "paramLongName": "transformationRule", - "paramDescription": "the transformation Rule", - "paramRequired": true - }, { "paramName": "m", "paramLongName": "master", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml index 15378c6c7..3eee2e5a8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml @@ -1,41 +1,21 @@ - mdstoreInputPath - the path of the input MDStore - - - - mdstoreOutputPath - the path of the cleaned mdstore + mainPath + the working path of Datacite stores - nativeInputPath - the path of the input MDStore + isLookupUrl + The IS lookUp service endopoint - - skipimport - false - the path of the input MDStore - - - - - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - ${wf:conf('resumeFrom') eq 'TransformJob'} - ${wf:conf('resumeFrom') eq 'ExportDataset'} - - - @@ -53,10 +33,9 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - -t${nativeInputPath} - -d${mdstoreInputPath} - -n${nameNode} - -s${skipimport} + --targetPath${mainPath}/datacite_update + --dataciteDumpPath${mainPath}/datacite_dump + --namenode${nameNode} --masteryarn-cluster @@ -81,44 +60,9 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --sourcePath${mdstoreInputPath} - --targetPath${mdstoreOutputPath} + --sourcePath${mainPath}/datacite_dump + --targetPath${mainPath}/datacite_oaf --isLookupUrl${isLookupUrl} - -tr${isLookupUrl} - --masteryarn-cluster - - - - - - - - - - - - - - - - - yarn-cluster - cluster - ExportDataset - eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode - dhp-aggregation-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePath${mdstoreOutputPath} - --targetPath${mdstoreOutputPath}_raw_AS --masteryarn-cluster diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index 61e5710fa..3d7b1bf22 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + mdStoreInputId diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala new file mode 100644 index 000000000..a7d404300 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala @@ -0,0 +1,35 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest +import eu.dnetlib.dhp.schema.oaf.Oaf +import org.junit.jupiter.api.extension.ExtendWith +import org.junit.jupiter.api.{BeforeEach, Test} +import org.mockito.junit.jupiter.MockitoExtension + +import scala.io.Source + +@ExtendWith(Array(classOf[MockitoExtension])) +class DataciteToOAFTest extends AbstractVocabularyTest{ + + + @BeforeEach + def setUp() :Unit = { + println("Called Method") + super.setUpVocabulary() + } + + @Test + def testMapping() :Unit = { + val record =Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString + + + + val mapper = new ObjectMapper() + val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies ) + println (mapper.writeValueAsString(res.head)) + + + } + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json new file mode 100644 index 000000000..b5ac40e25 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json @@ -0,0 +1 @@ +{"relationships": {"client": {"data": {"type": "clients", "id": "gbif.gbif"}}}, "attributes": {"partCount": 0, "contributors": [], "versionCount": 0, "titles": [{"title": "Occurrence Download"}], "descriptions": [{"lang": "eng", "descriptionType": "Abstract", "description": "A dataset containing 2452 species occurrences available in GBIF matching the query: { \"TaxonKey\" : [ \"is Indotyphlops braminus (Daudin, 1803)\" ] } The dataset includes 2452 records from 48 constituent datasets: 8 records from India Biodiversity Portal publication grade dataset. 6 records from UCM Amphibian and Reptile Collection (Arctos). 577 records from iNaturalist Research-grade Observations. 5 records from NCSM Herpetology Collection. 4 records from Vertebrate Zoology Division - Herpetology, Yale Peabody Museum. 4 records from Asian Herpetological Collection - IICT. 11 records from WildNet - Queensland Wildlife Data. 68 records from Australian Museum provider for OZCAM. 33 records from Geographically tagged INSDC sequences. 8 records from LACM Vertebrate Collection. 19 records from Herpetology. 88 records from KUBI Herpetology Collection. 187 records from The reptiles and amphibians collection (RA) of the Mus\u00e9um national d'Histoire Naturelle (MNHN - Paris). 36 records from Queensland Museum provider for OZCAM. 46 records from DAYO: Invasive Alien Reptiles in the Philippines. 33 records from MVZ Herp Collection (Arctos). 22 records from BYU Herpetology Collection. 1 records from Atlas des amphibiens et reptiles de Martinique. 2 records from University of Alberta Museum of Zoology Amphibian and Reptile Collection (UAMZ). 2 records from NMNH Paleobiology Specimen Records. 23 records from South Australian Museum Australia provider for OZCAM. 30 records from Fauna Atlas N.T.. 302 records from Museum of Comparative Zoology, Harvard University. 9 records from Herpetology Collection NRM. 397 records from University of Florida Herpetology. 40 records from Western Australian Museum provider for OZCAM. 1 records from Collection Herpetology SMF. 16 records from TNHC Herpetology Collection. 28 records from International Barcode of Life project (iBOL). 26 records from Base de donn\u00e9es de NOI \u2013 JDD_HISTORIQUE. 24 records from Naturalis Biodiversity Center (NL) - Amphibia and Reptilia. 242 records from Field Museum of Natural History (Zoology) Amphibian and Reptile Collection. 18 records from Australian National Wildlife Collection provider for OZCAM. 1 records from Herpetology. 1 records from Donn\u00e9es naturalistes d'Olivier ESCUDER. 12 records from Reptile Specimens. 1 records from Lund Museum of Zoology (MZLU). 2 records from Donn\u00e9es d'occurrences Esp\u00e8ces issues de l'inventaire des ZNIEFF. 6 records from Museums Victoria provider for OZCAM. 3 records from Questagame weekly feed. 78 records from Northern Territory Museum and Art Gallery provider for OZCAM. 1 records from SysTax - Zoological Collections. 11 records from UMZC Zoological Specimens. 1 records from AUMNH Herpetology Voucher Collection. 1 records from Tissues Specimens. 10 records from Apoyo a las colecciones biol\u00f3gicas de la Facultad de Ciencias de la UNAM: Fase 1 (MZFC_HE). 4 records from HerpMapper. 4 records from ALA species sightings and OzAtlas. Data from some individual datasets included in this download may be licensed under less restrictive terms."}], "referenceCount": 0, "subjects": [{"lang": "eng", "subject": "GBIF"}, {"lang": "eng", "subject": "biodiversity"}, {"lang": "eng", "subject": "species occurrences"}], "container": {}, "state": "findable", "created": "2020-08-23T05:03:37.000Z", "source": null, "metadataVersion": 0, "version": null, "isActive": true, "registered": "2020-08-23T05:03:37.000Z", "contentUrl": null, "geoLocations": [], "updated": "2020-08-23T05:03:37.000Z", "fundingReferences": [], "partOfCount": 0, "viewCount": 0, "versionOfCount": 0, "published": "2020", "dates": [{"date": "2020-08-23", "dateType": "Created"}, {"date": "2020-08-23", "dateType": "Updated"}, {"date": "2020", "dateType": "Issued"}], "relatedIdentifiers": [{"relationType": "References", "relatedIdentifier": "10.15468/rs5upd", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/1llmgl", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ab3s5x", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/enivwl", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ypdvp9", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/fhn7xo", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lxgoyb", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/e7susi", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/cndomv", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/77rmwd", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/2wlj2m", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ubdwdc", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/whdzq3", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lotsye", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/cpv8vf", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/pi1mts", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/tekwqq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/4eswn6", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.18165/qmltit", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/7m0fvd", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/wz4rrh", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/eeg0zb", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/p5rupv", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/b9o7h4", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/vw3dvj", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/5qt0dm", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lkc3vq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/xrorih", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/inygc6", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lhcdhw", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ythnjq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/u2pzhj", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/gscnac", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/px1sya", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/g5giua", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/cplkwg", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/mw39rb", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/ikshke", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/lp1ctu", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/slqqt8", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/giro3a", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/zyqkbl", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/pjmjvn", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/d1vglq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/5fmfwq", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/htjhrb", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/9tsf2l", "relatedIdentifierType": "DOI"}, {"relationType": "References", "relatedIdentifier": "10.15468/jayxmn", "relatedIdentifierType": "DOI"}], "reason": null, "rightsList": [{"rightsUri": "https://creativecommons.org/licenses/by-nc/4.0/legalcode", "rightsIdentifier": "cc-by-nc-4.0", "rightsIdentifierScheme": "SPDX", "schemeUri": "https://spdx.org/licenses/", "rights": "Creative Commons Attribution Non Commercial 4.0 International"}], "schemaVersion": "http://datacite.org/schema/kernel-4", "types": {"citeproc": "dataset", "resourceTypeGeneral": "Dataset", "schemaOrg": "Dataset", "bibtex": "misc", "ris": "DATA"}, "publisher": "The Global Biodiversity Information Facility", "publicationYear": 2020, "doi": "10.15468/dl.g9k8b9", "language": null, "sizes": ["161836"], "url": "https://www.gbif.org/occurrence/download/0044053-200613084148143", "identifiers": [], "citationCount": 0, "formats": ["Darwin Core Archive"], "downloadCount": 0, "creators": [{"nameType": "Organizational", "nameIdentifiers": [], "name": "Occdownload Gbif.Org", "affiliation": []}]}, "type": "dois", "id": "10.15468/dl.g9k8b9", "timestamp": 1598151817} \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala index 88fee72b7..76d29206a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala @@ -6,7 +6,6 @@ import org.apache.spark.SparkConf import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.types._ import org.slf4j.{Logger, LoggerFactory} -import org.apache.spark.sql.functions._ object SparkImportMagIntoDataset { val datatypedict = Map( @@ -25,11 +24,10 @@ object SparkImportMagIntoDataset { "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")), "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), - "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")), + "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")), "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")), "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")), - // ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime'] "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")), "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")), @@ -37,6 +35,7 @@ object SparkImportMagIntoDataset { "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")), "PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")), "PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")), + "PaperMeSH" -> Tuple2("advanced/PaperMeSH.txt", Seq("PaperId:long", "DescriptorUI:string", "DescriptorName:string", "QualifierUI:string", "QualifierName:string", "IsMajorTopic:bool")), "PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")), "PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")), "PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),