From 99cf3a8ea4980de25f7030acc398ac751ac165ea Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 28 Jan 2021 16:34:46 +0100 Subject: [PATCH] Merged Datacite transform into this branch --- .../dhp/schema/scholexplorer/OafUtils.scala | 4 +- dhp-workflows/dhp-aggregation/pom.xml | 36 +- .../datacite/AbstractRestClient.scala | 73 ++ .../datacite/DataciteAPIImporter.scala | 25 + .../DataciteToOAFTransformation.scala | 475 ++++++++ .../datacite/ExportActionSetJobNode.scala | 41 + .../GenerateDataciteDatasetSpark.scala | 48 + .../datacite/ImportDatacite.scala | 168 +++ .../actionmanager/datacite/datacite_filter | 28 + .../datacite/exportDataset_parameters.json | 21 + .../datacite/generate_dataset_params.json | 33 + .../actionmanager/datacite/hostedBy_map.json | 1032 +++++++++++++++++ .../datacite/import_from_api.json | 27 + .../datacite/oozie_app/config-default.xml | 18 + .../datacite/oozie_app/workflow.xml | 103 ++ .../doiboost/DoiBoostMappingUtil.scala | 4 +- .../java/eu/dnetlib/dhp/export/DLIToOAF.scala | 2 +- 17 files changed, 2132 insertions(+), 6 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter create mode 100644 
dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala index 27eec77fa..526d65782 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala @@ -15,11 +15,11 @@ object OafUtils { } - def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = { + def generateDataInfo(trust: String = "0.9", invisible: Boolean = false): DataInfo = { val di = new DataInfo di.setDeletedbyinference(false) di.setInferred(false) - di.setInvisible(false) + di.setInvisible(invisible) di.setTrust(trust) di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) di diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index cf0fa0efe..0445e0e1b 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -7,10 +7,44 @@ 1.2.4-SNAPSHOT dhp-aggregation - + + + + net.alchim31.maven + scala-maven-plugin + ${net.alchim31.maven.version} + + + scala-compile-first + initialize + + add-source + 
compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + org.apache.httpcomponents + httpclient + + org.apache.spark spark-core_2.11 diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala new file mode 100644 index 000000000..852147ccd --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala @@ -0,0 +1,73 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import org.apache.commons.io.IOUtils +import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest} +import org.apache.http.entity.StringEntity +import org.apache.http.impl.client.HttpClients + +import java.io.IOException + +abstract class AbstractRestClient extends Iterator[String]{ + + var buffer: List[String] = List() + var current_index:Int = 0 + + var scroll_value: Option[String] = None + + var complete:Boolean = false + + + def extractInfo(input: String): Unit + + protected def getBufferData(): Unit + + + def doHTTPGETRequest(url:String): String = { + val httpGet = new HttpGet(url) + doHTTPRequest(httpGet) + + } + + def doHTTPPOSTRequest(url:String, json:String): String = { + val httpPost = new HttpPost(url) + if (json != null) { + val entity = new StringEntity(json) + httpPost.setEntity(entity) + httpPost.setHeader("Accept", "application/json") + httpPost.setHeader("Content-type", "application/json") + } + doHTTPRequest(httpPost) + } + + def hasNext: Boolean = { + buffer.nonEmpty && current_index < buffer.size + } + + + override def next(): String = { + val next_item:String = buffer(current_index) + current_index = current_index + 1 + if (current_index == buffer.size) + getBufferData() + next_item + } + + + private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={ + val 
client = HttpClients.createDefault + try { + val response = client.execute(r) + IOUtils.toString(response.getEntity.getContent) + } catch { + case e: Throwable => + throw new RuntimeException("Error on executing request ", e) + } finally try client.close() + catch { + case e: IOException => + throw new RuntimeException("Unable to close client ", e) + } + } + + getBufferData() + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala new file mode 100644 index 000000000..c2ad6855c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala @@ -0,0 +1,25 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import org.json4s.{DefaultFormats, JValue} +import org.json4s.jackson.JsonMethods.{compact, parse, render} + +class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10) extends AbstractRestClient { + + override def extractInfo(input: String): Unit = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + buffer = (json \ "data").extract[List[JValue]].map(s => compact(render(s))) + val next_url = (json \ "links" \ "next").extractOrElse[String](null) + scroll_value = if (next_url != null && next_url.nonEmpty) Some(next_url) else None + if (scroll_value.isEmpty) + complete = true + current_index = 0 + } + + override def getBufferData(): Unit = { + if (!complete) { + val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20*]") + extractInfo(response) + } + } +} \ No newline at end of file diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala new file mode 100644 index 000000000..9418e71da --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -0,0 +1,475 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.schema.action.AtomicAction +import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.lang3.StringUtils +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, JString} +import org.json4s.jackson.JsonMethods.parse + +import java.nio.charset.CodingErrorAction +import java.time.LocalDate +import java.time.format.DateTimeFormatter +import java.util.Locale +import java.util.regex.Pattern +import scala.collection.JavaConverters._ +import scala.io.{Codec, Source} + + + +case class DataciteType(doi:String,timestamp:Long,isActive:Boolean, json:String ){} + +case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {} + +case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {} + +case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {} + +case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {} + +case class DescriptionType(descriptionType: 
Option[String], description: Option[String]) {} + +case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {} + +case class DateType(date: Option[String], dateType: Option[String]) {} + +case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {} + +object DataciteToOAFTransformation { + + implicit val codec: Codec = Codec("UTF-8") + codec.onMalformedInput(CodingErrorAction.REPLACE) + codec.onUnmappableCharacter(CodingErrorAction.REPLACE) + + private val PID_VOCABULARY = "dnet:pid_types" + val COBJ_VOCABULARY = "dnet:publication_resource" + val RESULT_VOCABULARY = "dnet:result_typologies" + val ACCESS_MODE_VOCABULARY = "dnet:access_modes" + val DOI_CLASS = "doi" + + val TITLE_SCHEME = "dnet:dataCite_title" + val SUBJ_CLASS = "keywords" + val SUBJ_SCHEME = "dnet:subject_classification_typologies" + + val j_filter:List[String] = { + val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString + s.lines.toList + } + + val mapper = new ObjectMapper() + val unknown_repository: HostedByMapType = HostedByMapType("openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18", "Unknown Repository", "Unknown Repository", Some(1.0F)) + + val dataInfo: DataInfo = generateDataInfo("0.9") + val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("openaire____::datacite", "Datacite") + + val hostedByMap: Map[String, HostedByMapType] = { + val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(s) + json.extract[Map[String, HostedByMapType]] + } + + val df_en: DateTimeFormatter = 
DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH) + val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) + + val funder_regex:List[(Pattern, String)] = List( + (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda__h2020::"), + (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda_______::") + + ) + + val Date_regex: List[Pattern] = List( + //Y-M-D + Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), + //M-D-Y + Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE), + //D-M-Y + Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE), + //Y + Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE) + ) + + + def filter_json(json:String):Boolean = { + j_filter.exists(f => json.contains(f)) + } + + def toActionSet(item:Oaf) :(String, String) = { + val mapper = new ObjectMapper() + + item match { + case dataset: OafDataset => + val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset] + a.setClazz(classOf[OafDataset]) + a.setPayload(dataset) + (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case publication: 
Publication => + val a: AtomicAction[Publication] = new AtomicAction[Publication] + a.setClazz(classOf[Publication]) + a.setPayload(publication) + (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case software: Software => + val a: AtomicAction[Software] = new AtomicAction[Software] + a.setClazz(classOf[Software]) + a.setPayload(software) + (software.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case orp: OtherResearchProduct => + val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct] + a.setClazz(classOf[OtherResearchProduct]) + a.setPayload(orp) + (orp.getClass.getCanonicalName, mapper.writeValueAsString(a)) + + case relation: Relation => + val a: AtomicAction[Relation] = new AtomicAction[Relation] + a.setClazz(classOf[Relation]) + a.setPayload(relation) + (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case _ => + null + } + + } + + + + + def embargo_end(embargo_end_date: String): Boolean = { + val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) + val td = LocalDate.now() + td.isAfter(dt) + } + + + def extract_date(input: String): Option[String] = { + val d = Date_regex.map(pattern => { + val matcher = pattern.matcher(input) + if (matcher.find()) + matcher.group(0) + else + null + } + ).find(s => s != null) + + if (d.isDefined) { + val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get + try { + return Some(LocalDate.parse(a_date, df_en).toString) + } catch { + case _: Throwable => try { + return Some(LocalDate.parse(a_date, df_it).toString) + } catch { + case _: Throwable => try { + return None + } + } + } + } + d + } + + def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): (Qualifier, Qualifier) = { + if (resourceType != null && resourceType.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceType) + if (typeQualifier != 
null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + } + if (schemaOrg != null && schemaOrg.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, schemaOrg) + if (typeQualifier != null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + + } + if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceTypeGeneral) + if (typeQualifier != null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + + } + null + } + + + def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): Result = { + val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + if (typeQualifiers == null) + return null + val i = new Instance + i.setInstancetype(typeQualifiers._1) + typeQualifiers._2.getClassname match { + case "dataset" => + val r = new OafDataset + r.setInstance(List(i).asJava) + return r + case "publication" => + val r = new Publication + r.setInstance(List(i).asJava) + return r + case "software" => + val r = new Software + r.setInstance(List(i).asJava) + return r + case "other" => + val r = new OtherResearchProduct + r.setInstance(List(i).asJava) + return r + } + null + } + + + def available_date(input: String): Boolean = { + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + val l: List[String] = for { + JObject(dates) <- json \\ "dates" + JField("dateType", JString(dateTypes)) <- dates + } yield dateTypes + + l.exists(p => p.equalsIgnoreCase("available")) + + } + + + def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = { + OafMapperUtils.structuredProperty(dt, q, null) + } + + 
def generateRelation(sourceId:String, targetId:String, relClass:String, cf:KeyValue, di:DataInfo) :Relation = { + + val r = new Relation + r.setSource(sourceId) + r.setTarget(targetId) + r.setRelType("resultProject") + r.setRelClass(relClass) + r.setSubRelType("outcome") + r.setCollectedfrom(List(cf).asJava) + r.setDataInfo(di) + r + + + } + + def get_projectRelation(awardUri:String, sourceId:String):List[Relation] = { + val match_pattern = funder_regex.find(s =>s._1.matcher(awardUri).find()) + + if (match_pattern.isDefined) { + val m =match_pattern.get._1 + val p = match_pattern.get._2 + val grantId = m.matcher(awardUri).replaceAll("$2") + val targetId = s"$p${DHPUtils.md5(grantId)}" + List( + generateRelation(sourceId, targetId,"isProducedBy", DATACITE_COLLECTED_FROM, dataInfo), + generateRelation(targetId, sourceId,"produces", DATACITE_COLLECTED_FROM, dataInfo) + ) + } + else + List() + + } + + + def generateOAF(input:String,ts:Long, dateOfCollection:Long, vocabularies: VocabularyGroup):List[Oaf] = { + if (filter_json(input)) + return List() + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + + val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null) + val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null) + val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null) + + val doi = (json \ "attributes" \ "doi").extract[String] + if (doi.isEmpty) + return List() + + //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies + val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + if (result == null) + return List() + + + val doi_q = vocabularies.getSynonymAsQualifier(PID_VOCABULARY, "doi") + val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) + result.setPid(List(pid).asJava) + 
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) + result.setOriginalId(List(doi).asJava) + result.setDateofcollection(s"${dateOfCollection}") + result.setDateoftransformation(s"$ts") + result.setDataInfo(dataInfo) + + val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List()) + + + val authors = creators.zipWithIndex.map { case (c, idx) => + val a = new Author + a.setFullname(c.name.orNull) + a.setName(c.givenName.orNull) + a.setSurname(c.familyName.orNull) + if (c.nameIdentifiers!= null&& c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) { + a.setPid(c.nameIdentifiers.get.map(ni => { + val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(PID_VOCABULARY, ni.nameIdentifierScheme.get.toLowerCase()) else null + if (ni.nameIdentifier!= null && ni.nameIdentifier.isDefined) { + OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo) + } + else + null + + } + ) + .asJava) + } + if (c.affiliation.isDefined) + a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava) + a.setRank(idx + 1) + a + } + + + + + val titles:List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List()) + + result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => { + if (t.titleType.isEmpty) { + OafMapperUtils.structuredProperty(t.title.get, "main title", "main title", TITLE_SCHEME, TITLE_SCHEME, null) + } else { + OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, TITLE_SCHEME, TITLE_SCHEME, null) + } + }).asJava) + + if(authors==null || authors.isEmpty || !authors.exists(a => a !=null)) + return List() + result.setAuthor(authors.asJava) + + val dates = (json \\ "dates").extract[List[DateType]] + val publication_year = (json \\ "publicationYear").extractOrElse[String](null) + + val i_date = dates + .filter(d => d.date.isDefined && d.dateType.isDefined) + .find(d => 
d.dateType.get.equalsIgnoreCase("issued")) + .map(d => extract_date(d.date.get)) + val a_date: Option[String] = dates + .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available")) + .map(d => extract_date(d.date.get)) + .find(d => d != null && d.isDefined) + .map(d => d.get) + + if (a_date.isDefined) { + result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null)) + } + if (i_date.isDefined && i_date.get.isDefined) { + result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + } + else if (publication_year != null) { + result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + } + + + result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined) + .map(d => (extract_date(d.date.get), d.dateType.get)) + .filter(d => d._1.isDefined) + .map(d => (d._1.get, vocabularies.getTermAsQualifier("dnet:dataCite_date", d._2.toLowerCase()))) + .filter(d => d._2 != null) + .map(d => generateOAFDate(d._1, d._2)).asJava) + + val subjects = (json \\ "subjects").extract[List[SubjectType]] + + result.setSubject(subjects.filter(s => s.subject.nonEmpty) + .map(s => + OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, SUBJ_SCHEME, SUBJ_SCHEME, null) + ).asJava) + + + result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + + val descriptions = (json \\ "descriptions").extract[List[DescriptionType]] + + result.setDescription( + descriptions + .filter(d => d.description.isDefined). 
+ map(d => + OafMapperUtils.field(d.description.get, null) + ).filter(s => s!=null).asJava) + + + val publisher = (json \\ "publisher").extractOrElse[String](null) + if (publisher != null) + result.setPublisher(OafMapperUtils.field(publisher, null)) + + + val language: String = (json \\ "language").extractOrElse[String](null) + + if (language != null) + result.setLanguage(vocabularies.getSynonymAsQualifier("dnet:languages", language)) + + + val instance = result.getInstance().get(0) + + val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String] + + val accessRights:List[String] = for { + JObject(rightsList) <- json \\ "rightsList" + JField("rightsUri", JString(rightsUri)) <- rightsList + } yield rightsUri + + val aRights: Option[Qualifier] = accessRights.map(r => { + vocabularies.getSynonymAsQualifier(ACCESS_MODE_VOCABULARY, r) + }).find(q => q != null) + + + val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier("UNKNOWN", "not available", ACCESS_MODE_VOCABULARY, ACCESS_MODE_VOCABULARY) + + if (client.isDefined) { + val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository) + instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name)) + instance.setCollectedfrom(DATACITE_COLLECTED_FROM) + instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) + instance.setAccessright(access_rights_qualifier) + + //'http') and matches(., '.*(/licenses|/publicdomain|unlicense.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*')]"> + val license = accessRights + .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*")) + if (license.isDefined) + instance.setLicense(OafMapperUtils.field(license.get, null)) + } + + + val awardUris:List[String] = for { + JObject(fundingReferences) <- json \\ "fundingReferences" + 
JField("awardUri", JString(awardUri)) <- fundingReferences + } yield awardUri + + val relations:List[Relation] =awardUris.flatMap(a=> get_projectRelation(a, result.getId)).filter(r => r!= null) + + if (relations!= null && relations.nonEmpty) { + List(result):::relations + } + else + List(result) + } + + def generateDataInfo(trust: String): DataInfo = { + val di = new DataInfo + di.setDeletedbyinference(false) + di.setInferred(false) + di.setInvisible(false) + di.setTrust(trust) + di.setProvenanceaction(OafMapperUtils.qualifier("sysimport:actionset", "sysimport:actionset", "dnet:provenanceActions", "dnet:provenanceActions")) + di + } + + def generateDSId(input: String): String = { + val b = StringUtils.substringBefore(input, "::") + val a = StringUtils.substringAfter(input, "::") + s"10|$b::${DHPUtils.md5(a)}" + } + + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala new file mode 100644 index 000000000..9f0d25735 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala @@ -0,0 +1,41 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.Oaf +import org.apache.hadoop.io.Text +import org.apache.hadoop.io.compress.GzipCodec +import org.apache.hadoop.mapred.SequenceFileOutputFormat +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +import scala.io.Source + +object ExportActionSetJobNode { + + val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass) + + def main(args: Array[String]): Unit = { + val conf = new SparkConf + val parser = new 
ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString) + parser.parseArgument(args) + val master = parser.get("master") + val sourcePath = parser.get("sourcePath") + val targetPath = parser.get("targetPath") + + val spark: SparkSession = SparkSession.builder().config(conf) + .appName(ExportActionSetJobNode.getClass.getSimpleName) + .master(master) + .getOrCreate() + implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING) + + spark.read.load(sourcePath).as[Oaf] + .map(o =>DataciteToOAFTransformation.toActionSet(o)) + .filter(o => o!= null) + .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) + + + } + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala new file mode 100644 index 000000000..6837e94b2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala @@ -0,0 +1,48 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.model.mdstore.MetadataRecord +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.utils.ISLookupClientFactory +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +import scala.io.Source + +object GenerateDataciteDatasetSpark { + + val log: Logger = 
LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass) + + def main(args: Array[String]): Unit = { + val conf = new SparkConf + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString) + parser.parseArgument(args) + val master = parser.get("master") + val sourcePath = parser.get("sourcePath") + val targetPath = parser.get("targetPath") + val isLookupUrl: String = parser.get("isLookupUrl") + log.info("isLookupUrl: {}", isLookupUrl) + + val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) + val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) + + val spark: SparkSession = SparkSession.builder().config(conf) + .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName) + .master(master) + .getOrCreate() + + implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord] + + implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + + import spark.implicits._ + + spark.read.load(sourcePath).as[DataciteType] + .filter(d => d.isActive) + .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies)) + .filter(d => d != null) + .write.mode(SaveMode.Overwrite).save(targetPath) + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala new file mode 100644 index 000000000..06fcbb518 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala @@ -0,0 +1,168 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} +import 
org.apache.hadoop.hdfs.DistributedFileSystem +import org.apache.hadoop.io.{IntWritable, SequenceFile, Text} +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.expressions.Aggregator +import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession} +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods.parse +import org.apache.spark.sql.functions.max +import org.slf4j.{Logger, LoggerFactory} + +import java.time.format.DateTimeFormatter._ +import java.time.{LocalDateTime, ZoneOffset} +import scala.io.Source + +object ImportDatacite { + + val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass) + + + def convertAPIStringToDataciteItem(input:String): DataciteType = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase + + val isActive = (json \ "attributes" \ "isActive").extract[Boolean] + + val timestamp_string = (json \ "attributes" \ "updated").extract[String] + val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME) + DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli/1000, isActive = isActive, json = input) + + } + + + + def main(args: Array[String]): Unit = { + + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString) + parser.parseArgument(args) + val master = parser.get("master") + + val hdfsuri = parser.get("namenode") + log.info(s"namenode is $hdfsuri") + + val targetPath = parser.get("targetPath") + log.info(s"targetPath is $targetPath") + + val dataciteDump = parser.get("dataciteDumpPath") + log.info(s"dataciteDump is $dataciteDump") + + val hdfsTargetPath =new Path(targetPath) + log.info(s"hdfsTargetPath is $hdfsTargetPath") + + val spark: SparkSession = SparkSession.builder() + 
.appName(ImportDatacite.getClass.getSimpleName) + .master(master) + .getOrCreate() + + // ====== Init HDFS File System Object + val conf = new Configuration + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri) + + // Because of Maven + conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName) + conf.set("fs.file.impl", classOf[LocalFileSystem].getName) + val sc:SparkContext = spark.sparkContext + sc.setLogLevel("ERROR") + + import spark.implicits._ + + + val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable { + + override def zero: DataciteType = null + + override def reduce(a: DataciteType, b: DataciteType): DataciteType = { + if (b == null) + return a + if (a == null) + return b + if(a.timestamp >b.timestamp) { + return a + } + b + } + + override def merge(a: DataciteType, b: DataciteType): DataciteType = { + reduce(a,b) + } + + override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] + + override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] + + override def finish(reduction: DataciteType): DataciteType = reduction + } + + val dump:Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType] + val ts = dump.select(max("timestamp")).first().getLong(0) + + log.info(s"last Timestamp is $ts") + + val cnt = writeSequenceFile(hdfsTargetPath, ts, conf) + + log.info(s"Imported from Datacite API $cnt documents") + + if (cnt > 0) { + + val inputRdd:RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text]) + .map(s => s._2.toString) + .map(s => convertAPIStringToDataciteItem(s)) + spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset") + + val ds:Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType] + + dump + .union(ds) + .groupByKey(_.doi) + .agg(dataciteAggregator.toColumn) + .map(s=>s._2) + 
.repartition(4000) + .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated") + + val fs = FileSystem.get(sc.hadoopConfiguration) + fs.delete(new Path(s"$dataciteDump"), true) + fs.rename(new Path(s"${dataciteDump}_updated"),new Path(s"$dataciteDump")) + } + } + + private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration):Long = { + val client = new DataciteAPIImporter(timestamp*1000, 1000) + var i = 0 + try { + val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text])) + try { + + var start: Long = System.currentTimeMillis + var end: Long = 0 + val key: IntWritable = new IntWritable(i) + val value: Text = new Text + while ( { + client.hasNext + }) { + key.set({ + i += 1; + i - 1 + }) + value.set(client.next()) + writer.append(key, value) + writer.hflush() + if (i % 1000 == 0) { + end = System.currentTimeMillis + val time = (end - start) / 1000.0F + println(s"Imported $i in $time seconds") + start = System.currentTimeMillis + } + } + } finally if (writer != null) writer.close() + } + i + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter new file mode 100644 index 000000000..ad80d6998 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter @@ -0,0 +1,28 @@ +TUBYDI - Assistir Filmes e Series Online Grátis +123Movies +WATCH FULL MOVIE +Movierulz +Full Movie Online +MOVIé WatcH +The King of Staten Island 2020 Online For Free +Watch Train to Busan 2 2020 online for free +Sixth Sense Movie Novelization +Film Complet streaming vf gratuit en ligne +watch now free +LIVE stream watch +LIVE stream UFC +RBC Heritage live stream +MLBStreams Free 
+NFL Live Stream +Live Stream Free +Royal Ascot 2020 Live Stream +TV Shows Full Episodes Official +FuboTV +Gomovies +Online Free Trial Access +123watch +DÜŞÜK HAPI +Bebek Düşürme Yöntemleri +WHATSAP İLETİŞİM +Cytotec +düşük hapı \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json new file mode 100644 index 000000000..63e080337 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json @@ -0,0 +1,21 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source mdstore path", + "paramRequired": true + }, + + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the target mdstore path", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json new file mode 100644 index 000000000..34fa3ed99 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json @@ -0,0 +1,33 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source mdstore path", + "paramRequired": true + }, + + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the target mdstore path", + "paramRequired": true + }, + { + "paramName": "tr", + "paramLongName": "transformationRule", + "paramDescription": "the transformation Rule", + "paramRequired": true + 
}, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "isLookupUrl", + "paramDescription": "the isLookup URL", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json new file mode 100644 index 000000000..d014dab5a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json @@ -0,0 +1,1032 @@ +{ + "SND.QOG": { + "openaire_id": "re3data_____::r3d100012231", + "datacite_name": "Quality of Government Institute", + "official_name": "Quality of Government Institute's Data", + "similarity": 0.8985507246376812 + }, + "GESIS.CESSDA": { + "openaire_id": "re3data_____::r3d100010202", + "datacite_name": "CESSDA ERIC", + "official_name": "CESSDA ERIC" + }, + "BL.CRAN": { + "openaire_id": "re3data_____::r3d100012068", + "datacite_name": "Cranfield University", + "official_name": "Cranfield Online Research Data" + }, + "SUL.OPENNEURO": { + "openaire_id": "re3data_____::r3d100010924", + "datacite_name": "OpenNeuro", + "official_name": "OpenNeuro" + }, + "UNAVCO.UNAVCO": { + "openaire_id": "re3data_____::r3d100010872", + "datacite_name": "UNAVCO", + "official_name": "UNAVCO" + }, + "SUL.SDR": { + "openaire_id": "re3data_____::r3d100010710", + "datacite_name": "Stanford Digital Repository", + "official_name": "Stanford Digital Repository" + }, + "DK.ICES": { + "openaire_id": "re3data_____::r3d100011288", + "datacite_name": "International Council for the Exploration of the Sea (ICES)", + "official_name": "International Council for the Exploration of the Sea datasets", + "similarity": 0.8833333333333333 + }, + "CISTI.DFOSCIMR": { + "openaire_id": "re3data_____::r3d100012039", + 
"datacite_name": "Bedford Institute of Oceanography - Fisheries and Oceans Canada - Ocean Data and Information Section", + "official_name": "Bedford Institute of Oceanography - Oceanographic Databases" + }, + "CSIC.DIGITAL": { + "openaire_id": "re3data_____::r3d100011076", + "datacite_name": "Digital CSIC", + "official_name": "DIGITAL.CSIC" + }, + "TIB.PANGAEA": { + "openaire_id": "re3data_____::r3d100010134", + "datacite_name": "PANGAEA", + "official_name": "PANGAEA" + }, + "PSU.DATACOM": { + "openaire_id": "re3data_____::r3d100010477", + "datacite_name": "Data Commons", + "official_name": "ANU Data Commons", + "similarity": 0.8571428571428571 + }, + "ANDS.CENTRE72": { + "openaire_id": "re3data_____::r3d100010451", + "datacite_name": "PARADISEC", + "official_name": "Pacific and Regional Archive for Digital Sources in Endangered Cultures" + }, + "BL.OXDB": { + "openaire_id": "re3data_____::r3d100011653", + "datacite_name": "Oxford University Library Service Databank", + "official_name": "DataBank, Bodleian Libraries, University of Oxford" + }, + "BL.STANDREW": { + "openaire_id": "re3data_____::r3d100012411", + "datacite_name": "University of St Andrews", + "official_name": "St Andrews Research portal - Research Data" + }, + "TIB.BAFG": { + "openaire_id": "re3data_____::r3d100011664", + "datacite_name": "Bundesanstalt f\u00fcr Gew\u00e4sserkunde", + "official_name": "Geoportal der BFG" + }, + "CRUI.UNIBO": { + "openaire_id": "re3data_____::r3d100012604", + "datacite_name": "Universit\u00e0 degli Studi di Bologna", + "official_name": "AMS Acta" + }, + "GDCC.ODUM-LIBRARY": { + "openaire_id": "re3data_____::r3d100000005", + "datacite_name": "UNC Libraries", + "official_name": "UNC Dataverse" + }, + "RG.RG": { + "openaire_id": "re3data_____::r3d100012227", + "datacite_name": "ResearchGate", + "official_name": "ResearchGate" + }, + "TIB.EUMETSAT": { + "openaire_id": "re3data_____::r3d100010232", + "datacite_name": "EUMETSAT", + "official_name": "Eumetsat" + }, + 
"SND.SMHI": { + "openaire_id": "re3data_____::r3d100011776", + "datacite_name": "Swedish Meteorological and Hydrological Institute open data", + "official_name": "Swedish Meteorological and Hydrological Institute open data" + }, + "NOAA.NCEI": { + "openaire_id": "re3data_____::r3d100011801", + "datacite_name": "National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI)", + "official_name": "NCEI" + }, + "TIB.WDCC": { + "openaire_id": "re3data_____::r3d100010299", + "datacite_name": "World Data Center for Climate", + "official_name": "World Data Center for Climate" + }, + "CNGB.GIGADB": { + "openaire_id": "re3data_____::r3d100010478", + "datacite_name": "GigaDB", + "official_name": "GigaDB" + }, + "DELFT.VLIZ": { + "openaire_id": "re3data_____::r3d100010661", + "datacite_name": "Vlaams Instituut voor de Zee", + "official_name": "Flanders Marine Institute" + }, + "NUS.SB": { + "openaire_id": "re3data_____::r3d100012564", + "datacite_name": "National University of Singapore", + "official_name": "ScholarBank@NUS" + }, + "EDI.EDI": { + "openaire_id": "re3data_____::r3d100010272", + "datacite_name": "Environmental Data Initiative", + "official_name": "Environmental Data Initiative Repository" + }, + "INIST.ADISP": { + "openaire_id": "re3data_____::r3d100010494", + "datacite_name": "Quetelet PROGEDO Diffusion", + "official_name": "Quetelet PROGEDO Diffusion" + }, + "GESIS.SHARE": { + "openaire_id": "re3data_____::r3d100010430", + "datacite_name": "SHARE - ERIC", + "official_name": "Survey of Health, Ageing and Retirement in Europe" + }, + "ANDS.CENTRE-1": { + "openaire_id": "re3data_____::r3d100010864", + "datacite_name": "Griffith University", + "official_name": "Griffith University Research Data Repository" + }, + "BL.READING": { + "openaire_id": "re3data_____::r3d100012064", + "datacite_name": "University of Reading", + "official_name": "University of Reading Research Data Archive" + }, + "CORNELL.CISER": { + 
"openaire_id": "re3data_____::r3d100011056", + "datacite_name": "CISER Data Archive", + "official_name": "CISER Data Archive" + }, + "DRYAD.DRYAD": { + "openaire_id": "re3data_____::r3d100000044", + "datacite_name": "DRYAD", + "official_name": "DRYAD" + }, + "CDL.PISCO": { + "openaire_id": "re3data_____::r3d100010947", + "datacite_name": "Partnership for Interdisciplinary Studies of Coastal Oceans (PISCO)", + "official_name": "Partnership for Interdisciplinary Studies of Coastal Oceans" + }, + "IEEE.DATAPORT": { + "openaire_id": "re3data_____::r3d100012569", + "datacite_name": "IEEE DataPort", + "official_name": "IEEE DataPort" + }, + "DELFT.MAASTRO": { + "openaire_id": "re3data_____::r3d100011086", + "datacite_name": "MAASTRO Clinic", + "official_name": "CancerData.org" + }, + "USGS.PROD": { + "openaire_id": "re3data_____::r3d100010054", + "datacite_name": "USGS DOI Tool Production Environment", + "official_name": "U.S. Geological Survey" + }, + "GDCC.ODUM-DV": { + "openaire_id": "re3data_____::r3d100000005", + "datacite_name": "Odum Institute Dataverse", + "official_name": "UNC Dataverse" + }, + "CDL.SDSCSG": { + "openaire_id": "re3data_____::r3d100011690", + "datacite_name": "UCSD Signaling Gateway", + "official_name": "UCSD Signaling gateway" + }, + "ORBIS.NKN": { + "openaire_id": "re3data_____::r3d100011587", + "datacite_name": "Northwest Knowledge Network", + "official_name": "Northwest Knowledge Network" + }, + "ANDS.CENTRE63": { + "openaire_id": "re3data_____::r3d100010918", + "datacite_name": "Test: Atlas of Living Australia", + "official_name": "Atlas of Living Australia", + "similarity": 0.8928571428571429 + }, + "SML.TALKBANK": { + "openaire_id": "re3data_____::r3d100010887", + "datacite_name": "TalkBank", + "official_name": "TalkBank" + }, + "CORNELL.LIBRARY": { + "openaire_id": "re3data_____::r3d100012322", + "datacite_name": "Cornell University Library", + "official_name": "eCommons - Cornell's digital repository" + }, + "BL.SOTON": { + 
"openaire_id": "re3data_____::r3d100011245", + "datacite_name": "University of Southampton", + "official_name": "University of Southampton Institutional Research Repository" + }, + "GESIS.DB-BANK": { + "openaire_id": "re3data_____::r3d100012252", + "datacite_name": "Forschungsdaten- und Servicezentrum der Bundesbank", + "official_name": "Forschungsdaten- und Servicezentrum der Bundesbank" + }, + "ANDS.CENTRE68": { + "openaire_id": "re3data_____::r3d100010918", + "datacite_name": "Atlas of Living Australia", + "official_name": "Atlas of Living Australia" + }, + "ANDS.CENTRE69": { + "openaire_id": "re3data_____::r3d100010914", + "datacite_name": "Australian Ocean Data Network", + "official_name": "Australian Ocean Data Network Portal" + }, + "INIST.CDS": { + "openaire_id": "re3data_____::r3d100010584", + "datacite_name": "Strasbourg Astronomical Data Center", + "official_name": "Strasbourg Astronomical Data Center" + }, + "BL.NHM": { + "openaire_id": "re3data_____::r3d100011675", + "datacite_name": "Natural History Museum, London", + "official_name": "Natural History Museum, Data Portal" + }, + "BL.ADS": { + "openaire_id": "re3data_____::r3d100000006", + "datacite_name": "Archaeology Data Service", + "official_name": "Archaeology Data Service" + }, + "GDCC.JHU": { + "openaire_id": "re3data_____::r3d100011836", + "datacite_name": "Johns Hopkins University Library", + "official_name": "Johns Hopkins Data Archive Dataverse Network" + }, + "BL.ED": { + "openaire_id": "re3data_____::r3d100000047", + "datacite_name": "University of Edinburgh", + "official_name": "Edinburgh DataShare" + }, + "BL.EXETER": { + "openaire_id": "re3data_____::r3d100011202", + "datacite_name": "University of Exeter", + "official_name": "Open Research Exeter" + }, + "BL.NCL": { + "openaire_id": "re3data_____::r3d100012408", + "datacite_name": "Newcastle University", + "official_name": "NCL Data" + }, + "BROWN.BDR": { + "openaire_id": "re3data_____::r3d100011654", + "datacite_name": "Brown Digital 
Repository", + "official_name": "Brown Digital Repository" + }, + "GDCC.SYR-QDR": { + "openaire_id": "re3data_____::r3d100011038", + "datacite_name": "Syracuse University Qualitative Data Repository", + "official_name": "Qualitative Data Repository" + }, + "BL.BRISTOL": { + "openaire_id": "re3data_____::r3d100011099", + "datacite_name": "University of Bristol", + "official_name": "data.bris Research Data Repository" + }, + "DATACITE.DATACITE": { + "openaire_id": "openaire____::datacite", + "datacite_name": "DataCite", + "official_name": "Datacite" + }, + "ESTDOI.KEEL": { + "openaire_id": "re3data_____::r3d100011941", + "datacite_name": "Keeleressursid. The Center of Estonian Language Resources", + "official_name": "Center of Estonian Language Resources" + }, + "BL.ESSEX": { + "openaire_id": "re3data_____::r3d100012405", + "datacite_name": "University of Essex", + "official_name": "Research Data at Essex" + }, + "PURDUE.MDF": { + "openaire_id": "re3data_____::r3d100012080", + "datacite_name": "Univ Chicago Materials Data Facility", + "official_name": "Materials Data Facility" + }, + "DELFT.KNMI": { + "openaire_id": "re3data_____::r3d100011879", + "datacite_name": "KNMI Data Centre", + "official_name": "KNMI Data Centre" + }, + "CUL.CIESIN": { + "openaire_id": "re3data_____::r3d100010207", + "datacite_name": "Center for International Earth Science Information Network", + "official_name": "Center for International Earth Science Information Network" + }, + "WISC.NEOTOMA": { + "openaire_id": "re3data_____::r3d100011761", + "datacite_name": "Neotoma Paleoecological Database", + "official_name": "Neotoma Paleoecology Database", + "similarity": 0.9180327868852459 + }, + "IRIS.IRIS": { + "openaire_id": "re3data_____::r3d100010268", + "datacite_name": "Incorporated Research Institutions for Seismology", + "official_name": "Incorporated Research Institutions for Seismology" + }, + "ANDS.CENTRE50": { + "openaire_id": "re3data_____::r3d100012378", + "datacite_name": "Analysis 
and Policy Observatory", + "official_name": "Analysis and Policy Observatory" + }, + "FAO.RING": { + "openaire_id": "re3data_____::r3d100012571", + "datacite_name": "CIARD RING", + "official_name": "CIARD Ring" + }, + "CUL.R2R": { + "openaire_id": "re3data_____::r3d100010735", + "datacite_name": "Rolling Deck to Repository", + "official_name": "Rolling Deck to Repository" + }, + "DEMO.GRIIDC": { + "openaire_id": "re3data_____::r3d100011571", + "datacite_name": "Gulf of Mexico Research Initiative Information and Data Cooperative", + "official_name": "Gulf of Mexico Research Initiative Information and Data Cooperative" + }, + "ANDS.CENTRE-6": { + "openaire_id": "re3data_____::r3d100012268", + "datacite_name": "Curtin University", + "official_name": "Curtin University Research Data Collection" + }, + "ANDS.CENTRE-5": { + "openaire_id": "re3data_____::r3d100012013", + "datacite_name": "TERN Central Portal", + "official_name": "TERN Data Discovery portal" + }, + "FIGSHARE.UCT": { + "openaire_id": "re3data_____::r3d100012633", + "datacite_name": "University of Cape Town (UCT)", + "official_name": "ZivaHub" + }, + "BIBSYS.UIT-ORD": { + "openaire_id": "re3data_____::r3d100012538", + "datacite_name": "DataverseNO", + "official_name": "DataverseNO" + }, + "CISTI.CADC": { + "openaire_id": "re3data_____::r3d100000016", + "datacite_name": "Canadian Astronomy Data Centre", + "official_name": "The Canadian Astronomy Data Centre", + "similarity": 0.9375 + }, + "BL.CCDC": { + "openaire_id": "re3data_____::r3d100010197", + "datacite_name": "The Cambridge Crystallographic Data Centre", + "official_name": "The Cambridge Structural Database" + }, + "BL.UCLD": { + "openaire_id": "re3data_____::r3d100012417", + "datacite_name": "University College London", + "official_name": "UCL Discovery" + }, + "GESIS.RKI": { + "openaire_id": "re3data_____::r3d100010436", + "datacite_name": "'Health Monitoring' Research Data Centre at the Robert Koch Institute", + "official_name": "'Health Monitoring' 
Research Data Centre at the Robert Koch Institute" + }, + "BL.DRI": { + "openaire_id": "re3data_____::r3d100011805", + "datacite_name": "Digital Repository of Ireland", + "official_name": "Digital Repository of Ireland" + }, + "TIB.KIT-IMK": { + "openaire_id": "re3data_____::r3d100011956", + "datacite_name": "Institute for Meteorology and Climate Research - Atmospheric Trace Gases and Remote Sensing", + "official_name": "CARIBIC" + }, + "DOINZ.LANDCARE": { + "openaire_id": "re3data_____::r3d100011662", + "datacite_name": "Landcare Research New Zealand Ltd", + "official_name": "Landcare Research Data Repository" + }, + "DEMO.EMORY": { + "openaire_id": "re3data_____::r3d100011559", + "datacite_name": "The Cancer Imaging Archive", + "official_name": "The Cancer Imaging Archive" + }, + "UMN.DRUM": { + "openaire_id": "re3data_____::r3d100011393", + "datacite_name": "Data Repository for the University of Minnesota", + "official_name": "Data Repository for the University of Minnesota" + }, + "CISTI.SFU": { + "openaire_id": "re3data_____::r3d100012512", + "datacite_name": "Simon Fraser University", + "official_name": "SFU Radar" + }, + "GESIS.ICPSR": { + "openaire_id": "re3data_____::r3d100010255", + "datacite_name": "ICPSR", + "official_name": "Inter-university Consortium for Political and Social Research" + }, + "ANDS.CENTRE49": { + "openaire_id": "re3data_____::r3d100012145", + "datacite_name": "The University of Melbourne", + "official_name": "melbourne.figshare.com" + }, + "ZBW.IFO": { + "openaire_id": "re3data_____::r3d100010201", + "datacite_name": "LMU-ifo Economics & Business Data Center", + "official_name": "LMU-ifo Economics & Business Data Center" + }, + "TIB.BEILST": { + "openaire_id": "re3data_____::r3d100012329", + "datacite_name": "Beilstein-Institut zur F\u00f6rderung der Chemischen Wissenschaften", + "official_name": "STRENDA DB" + }, + "ZBW.ZBW-JDA": { + "openaire_id": "re3data_____::r3d100012190", + "datacite_name": "ZBW Journal Data Archive", + 
"official_name": "ZBW Journal Data Archive" + }, + "BL.UKDA": { + "openaire_id": "re3data_____::r3d100010215", + "datacite_name": "UK Data Archive", + "official_name": "UK Data Archive" + }, + "CERN.INSPIRE": { + "openaire_id": "re3data_____::r3d100011077", + "datacite_name": "inspirehep.net", + "official_name": "Inspire-HEP" + }, + "CISTI.OTNDC": { + "openaire_id": "re3data_____::r3d100012083", + "datacite_name": "Ocean Tracking Network", + "official_name": "Ocean Tracking Network" + }, + "CISTI.CC": { + "openaire_id": "re3data_____::r3d100012646", + "datacite_name": "Compute Canada", + "official_name": "Federated Research Data Repository" + }, + "SND.ICOS": { + "openaire_id": "re3data_____::r3d100012203", + "datacite_name": "ICOS Carbon Portal", + "official_name": "ICOS Carbon Portal" + }, + "BL.MENDELEY": { + "openaire_id": "re3data_____::r3d100011868", + "datacite_name": "Mendeley", + "official_name": "Mendeley Data" + }, + "DELFT.UU": { + "openaire_id": "re3data_____::r3d100011201", + "datacite_name": "Universiteit Utrecht", + "official_name": "DataverseNL" + }, + "GESIS.DSZ-BO": { + "openaire_id": "re3data_____::r3d100010439", + "datacite_name": "Data Service Center for Business and Organizational Data", + "official_name": "Data Service Center for Business and Organizational Data" + }, + "TIB.IPK": { + "openaire_id": "re3data_____::r3d100011647", + "datacite_name": "IPK Gatersleben", + "official_name": "IPK Gatersleben" + }, + "GDCC.HARVARD-DV": { + "openaire_id": "re3data_____::r3d100010051", + "datacite_name": "Harvard IQSS Dataverse", + "official_name": "Harvard Dataverse" + }, + "BL.LEEDS": { + "openaire_id": "re3data_____::r3d100011945", + "datacite_name": "University of Leeds", + "official_name": "Research Data Leeds Repository" + }, + "BL.BRUNEL": { + "openaire_id": "re3data_____::r3d100012140", + "datacite_name": "Brunel University London", + "official_name": "Brunel figshare" + }, + "DEMO.ENVIDAT": { + "openaire_id": "re3data_____::r3d100012587", + 
"datacite_name": "EnviDat", + "official_name": "EnviDat" + }, + "GDCC.NTU": { + "openaire_id": "re3data_____::r3d100012440", + "datacite_name": "Nanyang Technological University", + "official_name": "DR-NTU (Data)" + }, + "UNM.DATAONE": { + "openaire_id": "re3data_____::r3d100000045", + "datacite_name": "DataONE", + "official_name": "DataONE" + }, + "CSC.NRD": { + "openaire_id": "re3data_____::r3d100012157", + "datacite_name": "Ministry of Culture and Education", + "official_name": "IDA Research Data Storage Service" + }, + "GESIS.DIPF": { + "openaire_id": "re3data_____::r3d100010390", + "datacite_name": "Research Data Centre for Education", + "official_name": "Research Data Centre for Education" + }, + "BL.HALLAM": { + "openaire_id": "re3data_____::r3d100011909", + "datacite_name": "Sheffield Hallam University", + "official_name": "Sheffield Hallam University Research Data Archive" + }, + "BL.LSHTM": { + "openaire_id": "re3data_____::r3d100011800", + "datacite_name": "London School of Hygiene and Tropical Medicine", + "official_name": "LSHTM Data Compass" + }, + "SUBGOE.DARIAH": { + "openaire_id": "re3data_____::r3d100011345", + "datacite_name": "Digital Research Infrastructure for the Arts and Humanities", + "official_name": "DARIAH-DE Repository" + }, + "SND.SU": { + "openaire_id": "re3data_____::r3d100012147", + "datacite_name": "Stockholm University", + "official_name": "Stockholm University repository for data" + }, + "GESIS.INDEPTH": { + "openaire_id": "re3data_____::r3d100011392", + "datacite_name": "INDEPTH Network", + "official_name": "INDEPTH Data Repository" + }, + "TIB.FLOSS": { + "openaire_id": "re3data_____::r3d100010863", + "datacite_name": "FLOSS Project, Syracuse University", + "official_name": "FLOSSmole" + }, + "ETHZ.WGMS": { + "openaire_id": "re3data_____::r3d100010627", + "datacite_name": "World Glacier Monitoring Service", + "official_name": "World Glacier Monitoring Service" + }, + "BL.UEL": { + "openaire_id": "re3data_____::r3d100012414", + 
"datacite_name": "University of East London", + "official_name": "Data.uel" + }, + "DELFT.DATA4TU": { + "openaire_id": "re3data_____::r3d100010216", + "datacite_name": "4TU.Centre for Research Data", + "official_name": "4TU.Centre for Research Data" + }, + "GESIS.IANUS": { + "openaire_id": "re3data_____::r3d100012361", + "datacite_name": "IANUS - FDZ Arch\u00e4ologie & Altertumswissenschaften", + "official_name": "IANUS Datenportal" + }, + "CDL.UCSDCCA": { + "openaire_id": "re3data_____::r3d100011655", + "datacite_name": "California Coastal Atlas", + "official_name": "California Coastal Atlas" + }, + "VIVA.VT": { + "openaire_id": "re3data_____::r3d100012601", + "datacite_name": "Virginia Tech", + "official_name": "VTechData" + }, + "ANDS.CENTRE39": { + "openaire_id": "re3data_____::r3d100011640", + "datacite_name": "University of the Sunshine Coast", + "official_name": "USC Research Bank research data" + }, + "DEMO.OPENKIM": { + "openaire_id": "re3data_____::r3d100011864", + "datacite_name": "OpenKIM", + "official_name": "OpenKIM" + }, + "INIST.OTELO": { + "openaire_id": "re3data_____::r3d100012505", + "datacite_name": "Observatoire Terre Environnement de Lorraine", + "official_name": "ORDaR" + }, + "INIST.ILL": { + "openaire_id": "re3data_____::r3d100012072", + "datacite_name": "Institut Laue-Langevin", + "official_name": "ILL Data Portal" + }, + "ANDS.CENTRE31": { + "openaire_id": "re3data_____::r3d100012378", + "datacite_name": "Test: Analysis and Policy Observatory", + "official_name": "Analysis and Policy Observatory", + "similarity": 0.9117647058823529 + }, + "ANDS.CENTRE30": { + "openaire_id": "re3data_____::r3d100010917", + "datacite_name": "Test: Geoscience Australia", + "official_name": "Geoscience Australia", + "similarity": 0.8695652173913043 + }, + "BL.SALFORD": { + "openaire_id": "re3data_____::r3d100012144", + "datacite_name": "University of Salford", + "official_name": "University of Salford Data Repository" + }, + "CERN.HEPDATA": { + "openaire_id": 
"re3data_____::r3d100010081", + "datacite_name": "HEPData.net", + "official_name": "HEPData" + }, + "ETHZ.E-COLL": { + "openaire_id": "re3data_____::r3d100012557", + "datacite_name": "ETH Z\u00fcrich Research Collection", + "official_name": "ETH Z\u00fcrich Research Collection" + }, + "GBIF.GBIF": { + "openaire_id": "re3data_____::r3d100000039", + "datacite_name": "Global Biodiversity Information Facility", + "official_name": "Global Biodiversity Information Facility" + }, + "ORNLDAAC.DAAC": { + "openaire_id": "re3data_____::r3d100000037", + "datacite_name": "Oak Ridge National Laboratory Distributed Active Archive Center", + "official_name": "Oak Ridge National Laboratory Distributed Active Archive Center for Biogeochemical Dynamics" + }, + "KAUST.KAUSTREPO": { + "openaire_id": "re3data_____::r3d100011898", + "datacite_name": "KAUST Research Repository", + "official_name": "UWA Research Repository", + "similarity": 0.875 + }, + "ZBW.ZEW": { + "openaire_id": "re3data_____::r3d100010399", + "datacite_name": "Zentrum f\u00fcr Europ\u00e4ische Wirtschaftsforschung GmbH (ZEW)", + "official_name": "ZEW Forschungsdatenzentrum" + }, + "SML.TDAR": { + "openaire_id": "re3data_____::r3d100010347", + "datacite_name": "Digital Antiquity (TDAR)", + "official_name": "tDAR" + }, + "GESIS.CSDA": { + "openaire_id": "re3data_____::r3d100010484", + "datacite_name": "Czech Social Science Data Archive", + "official_name": "Czech Social Science Data Archive" + }, + "SND.BOLIN": { + "openaire_id": "re3data_____::r3d100011699", + "datacite_name": "Bolin Centre Database", + "official_name": "Bolin Centre Database" + }, + "MLA.HC": { + "openaire_id": "re3data_____::r3d100012309", + "datacite_name": "Humanities Commons", + "official_name": "Humanities Commons" + }, + "CDL.IDASHREP": { + "openaire_id": "re3data_____::r3d100010382", + "datacite_name": "iDASH Repository", + "official_name": "IDS Repository", + "similarity": 0.8666666666666667 + }, + "ZBMED.SNSB": { + "openaire_id": 
"re3data_____::r3d100011873", + "datacite_name": "Staatliche Naturwissenschaftliche Sammlungen Bayerns", + "official_name": "Staatliche Naturwissenschaftliche Sammlungen Bayerns - datasets", + "similarity": 0.9043478260869565 + }, + "ORBIS.OHSU": { + "openaire_id": "re3data_____::r3d100012244", + "datacite_name": "Oregon Health Sciences University", + "official_name": "OHSU Digital Commons" + }, + "DARTLIB.CRAWDAD": { + "openaire_id": "re3data_____::r3d100010716", + "datacite_name": "CRAWDAD", + "official_name": "CRAWDAD" + }, + "CDL.CCHDO": { + "openaire_id": "re3data_____::r3d100010831", + "datacite_name": "CLIVAR and Carbon Hydrographic Data Office", + "official_name": "Climate Variability and Predictability and Carbon Hydrographic Data Office" + }, + "GESIS.AUSSDA": { + "openaire_id": "re3data_____::r3d100010483", + "datacite_name": "Austrian Social Science Data Archive", + "official_name": "AUSSDA" + }, + "NSIDC.DATACTR": { + "openaire_id": "re3data_____::r3d100010110", + "datacite_name": "National Snow and Ice Data Center", + "official_name": "National Snow and Ice Data Center" + }, + "TIB.RADAR": { + "openaire_id": "re3data_____::r3d100012330", + "datacite_name": "FIZ Karlsruhe \u2013 Leibniz-Institut f\u00fcr Informationsinfrastruktur", + "official_name": "RADAR" + }, + "KIM.OPENKIM": { + "openaire_id": "re3data_____::r3d100011864", + "datacite_name": "Open Knowledgebase of Interatomic Models (OpenKIM)", + "official_name": "OpenKIM" + }, + "BL.LBORO": { + "openaire_id": "re3data_____::r3d100012143", + "datacite_name": "Loughborough University", + "official_name": "Loughborough Data Repository" + }, + "GESIS.ZPID": { + "openaire_id": "re3data_____::r3d100010328", + "datacite_name": "GESIS.ZPID", + "official_name": "PsychData" + }, + "SML.TCIA": { + "openaire_id": "re3data_____::r3d100011559", + "datacite_name": "The Cancer Imaging Archive", + "official_name": "The Cancer Imaging Archive" + }, + "CDL.IRIS": { + "openaire_id": "re3data_____::r3d100010268", + 
"datacite_name": "Incorporated Research Institutions for Seismology", + "official_name": "Incorporated Research Institutions for Seismology" + }, + "BIBSYS.NMDC": { + "openaire_id": "re3data_____::r3d100012291", + "datacite_name": "Norwegian Marine Data Centre", + "official_name": "Norwegian Polar Data Centre", + "similarity": 0.8727272727272727 + }, + "ANDS.CENTRE25": { + "openaire_id": "re3data_____::r3d100010917", + "datacite_name": "Geoscience Australia", + "official_name": "Geoscience Australia" + }, + "BL.UCLAN": { + "openaire_id": "re3data_____::r3d100012019", + "datacite_name": "University of Central Lancashire", + "official_name": "UCLanData" + }, + "ANDS.CENTRE23": { + "openaire_id": "re3data_____::r3d100011898", + "datacite_name": "The University of Western Australia", + "official_name": "UWA Research Repository" + }, + "CISTI.WOUDC": { + "openaire_id": "re3data_____::r3d100010367", + "datacite_name": "World Ozone and Ultraviolet Radiation Data Centre", + "official_name": "World Ozone and Ultraviolet Radiation Data Centre" + }, + "FIGSHARE.ARS": { + "openaire_id": "re3data_____::r3d10001066", + "datacite_name": "figshare Academic Research System", + "official_name": "figshare" + }, + "ILLINOIS.DATABANK": { + "openaire_id": "re3data_____::r3d100012001", + "datacite_name": "Illinois Data Bank", + "official_name": "Illinois Data Bank" + }, + "BL.ECMWF": { + "openaire_id": "re3data_____::r3d100011726", + "datacite_name": "European Centre for Medium-Range Weather Forecasts", + "official_name": "European Centre for Medium-Range Weather Forecasts" + }, + "CDL.ISSDA": { + "openaire_id": "re3data_____::r3d100010497", + "datacite_name": "Irish Social Science Data Archive (ISSDA)", + "official_name": "Irish Social Science Data Archive" + }, + "CDL.PQR": { + "openaire_id": "re3data_____::r3d100012225", + "datacite_name": "Pitt Quantum Repository", + "official_name": "Pitt Quantum Repository" + }, + "ANDS.CENTRE82": { + "openaire_id": "re3data_____::r3d100010138", + 
"datacite_name": "Test: Australian Data Archive", + "official_name": "Australian Data Archive", + "similarity": 0.8846153846153846 + }, + "GDCC.HARVARD-SLP": { + "openaire_id": "re3data_____::r3d100011861", + "datacite_name": "National Sleep Research Resource", + "official_name": "National Sleep Research Resource" + }, + "CDL.IMMPORT": { + "openaire_id": "re3data_____::r3d100012529", + "datacite_name": "UCSF ImmPort", + "official_name": "ImmPort" + }, + "GESIS.FID": { + "openaire_id": "re3data_____::r3d100012347", + "datacite_name": "FID f\u00fcr internationale und interdisziplin\u00e4re Rechtsforschung", + "official_name": "\u00b2Dok[\u00a7]" + }, + "OCEAN.OCEAN": { + "openaire_id": "re3data_____::r3d100012369", + "datacite_name": "Code Ocean", + "official_name": "Code Ocean" + }, + "CERN.ZENODO": { + "openaire_id": "re3data_____::r3d100010468", + "datacite_name": "Zenodo", + "official_name": "Zenodo" + }, + "ETHZ.DA-RD": { + "openaire_id": "re3data_____::r3d100011626", + "datacite_name": "ETHZ Data Archive - Research Data", + "official_name": "ETH Data Archive" + }, + "SND.ECDS": { + "openaire_id": "re3data_____::r3d100011000", + "datacite_name": "Environment Climate Data Sweden", + "official_name": "Environment Climate Data Sweden" + }, + "BL.BATH": { + "openaire_id": "re3data_____::r3d100011947", + "datacite_name": "University of Bath", + "official_name": "University of Bath Research Data Archive" + }, + "TIB.LDEO": { + "openaire_id": "re3data_____::r3d100012547", + "datacite_name": "LDEO - Lamont-Doherty Earth Observatory, Columbia University", + "official_name": "Lamont-Doherty Core Repository" + }, + "COS.OSF": { + "openaire_id": "re3data_____::r3d100011137", + "datacite_name": "Open Science Framework", + "official_name": "Open Science Framework" + }, + "ESTDOI.REPO": { + "openaire_id": "re3data_____::r3d100012333", + "datacite_name": "DataDOI", + "official_name": "DataDOI" + }, + "CDL.NSFADC": { + "openaire_id": "re3data_____::r3d100011973", + 
"datacite_name": "NSF Arctic Data Center", + "official_name": "NSF Arctic Data Center" + }, + "ANDS.CENTRE13": { + "openaire_id": "re3data_____::r3d100010477", + "datacite_name": "The Australian National University", + "official_name": "ANU Data Commons" + }, + "BL.NERC": { + "openaire_id": "re3data_____::r3d100010199", + "datacite_name": "Natural Environment Research Council", + "official_name": "Environmental Information Data Centre" + }, + "SAGEBIO.SYNAPSE": { + "openaire_id": "re3data_____::r3d100011894", + "datacite_name": "Synapse", + "official_name": "Synapse" + }, + "ANDS.CENTRE15": { + "openaire_id": "re3data_____::r3d100000038", + "datacite_name": "Australian Antarctic Division", + "official_name": "Australian Antarctic Data Centre" + }, + "WISC.BMRB": { + "openaire_id": "re3data_____::r3d100010191", + "datacite_name": "Biological Magnetic Resonance Bank", + "official_name": "Biological Magnetic Resonance Data Bank", + "similarity": 0.9315068493150684 + }, + "STSCI.MAST": { + "openaire_id": "re3data_____::r3d100010403", + "datacite_name": "Barbara A. Mikulski Archive for Space Telescopes", + "official_name": "Barbara A. 
Mikulski Archive for Space Telescopes" + }, + "CDL.NSIDC": { + "openaire_id": "re3data_____::r3d100010110", + "datacite_name": "National Snow and Ice Data Center", + "official_name": "National Snow and Ice Data Center" + }, + "BL.STRATH": { + "openaire_id": "re3data_____::r3d100012412", + "datacite_name": "University of Strathclyde", + "official_name": "University of Strathclyde KnowledgeBase Datasets" + }, + "DEMO.TDAR": { + "openaire_id": "re3data_____::r3d100010347", + "datacite_name": "The Digital Archaeological Record (tDAR)", + "official_name": "tDAR" + }, + "TIND.CALTECH": { + "openaire_id": "re3data_____::r3d100012384", + "datacite_name": "CaltechDATA", + "official_name": "CaltechDATA" + }, + "GESIS.BIBB-FDZ": { + "openaire_id": "re3data_____::r3d100010190", + "datacite_name": "Forschungsdatenzentrum im Bundesinstitut f\u00fcr Berufsbildung", + "official_name": "Forschungsdatenzentrum im Bundesinstitut f\u00fcr Berufsbildung" + }, + "ANDS.CENTRE87": { + "openaire_id": "re3data_____::r3d100010138", + "datacite_name": "Australian Data Archive", + "official_name": "Australian Data Archive" + }, + "GESIS.NEPS": { + "openaire_id": "re3data_____::r3d100010736", + "datacite_name": "Nationales Bildungspanel (National Educational Panel Study, NEPS)", + "official_name": "Nationales Bildungspanel" + }, + "CDL.UCBCRCNS": { + "openaire_id": "re3data_____::r3d100011269", + "datacite_name": "Collaborative Research in Computational Neuroscience (CRCNS)", + "official_name": "Collaborative Research in Computational Neuroscience" + }, + "TIB.UKON": { + "openaire_id": "re3data_____::r3d100010469", + "datacite_name": "Movebank", + "official_name": "Movebank" + }, + "UMN.IPUMS": { + "openaire_id": "re3data_____::r3d100010794", + "datacite_name": "Minnesota Population Center", + "official_name": "Minnesota Population Center" + }, + "TIB.BIKF": { + "openaire_id": "re3data_____::r3d100012379", + "datacite_name": "Senckenberg Data & Metadata Repository", + "official_name": 
"Senckenberg Data & Metadata Repository" + }, + "TDL.GRIIDC": { + "openaire_id": "re3data_____::r3d100011571", + "datacite_name": "Gulf of Mexico Research Initiative Information and Data Cooperative", + "official_name": "Gulf of Mexico Research Initiative Information and Data Cooperative" + }, + "DELFT.NIBG": { + "openaire_id": "re3data_____::r3d100012167", + "datacite_name": "Sound and Vision", + "official_name": "Sound and Vision" + }, + "BL.SURREY": { + "openaire_id": "re3data_____::r3d100012232", + "datacite_name": "University of Surrey", + "official_name": "Surrey Research Insight" + }, + "OSTI.ORNLNGEE": { + "openaire_id": "re3data_____::r3d100011676", + "datacite_name": "NGEE-Arctic (Next Generation Ecosystems Experiement)", + "official_name": "NGEE Arctic" + }, + "TIB.WDCRSAT": { + "openaire_id": "re3data_____::r3d100010156", + "datacite_name": "World Data Center for Remote Sensing of the Atmosphere", + "official_name": "The World Data Center for Remote Sensing of the Atmosphere", + "similarity": 0.9642857142857143 + }, + "ZBMED.DSMZ": { + "openaire_id": "re3data_____::r3d100010219", + "datacite_name": "DSMZ", + "official_name": "DSMZ" + }, + "DOINZ.NZAU": { + "openaire_id": "re3data_____::r3d100012110", + "datacite_name": "University of Auckland Data Publishing and Discovery Service", + "official_name": "University of Auckland Data Repository" + }, + "INIST.RESIF": { + "openaire_id": "re3data_____::r3d100012222", + "datacite_name": "R\u00e9seau sismologique et g\u00e9od\u00e9sique fran\u00e7ais", + "official_name": "RESIF Seismic Data Portal" + }, + "CDL.NCEAS": { + "openaire_id": "re3data_____::r3d100010093", + "datacite_name": "National Center for Ecological Analysis and Synthesis (NCEAS)", + "official_name": "National Center for Ecological Analysis and Synthesis Data Repository" + }, + "ZBMED.EMP": { + "openaire_id": "re3data_____::r3d100010234", + "datacite_name": "eyeMoviePedia", + "official_name": "eyeMoviePedia" + }, + "ZBMED.BIOFRESH": { + 
"openaire_id": "re3data_____::r3d100011651", + "datacite_name": "Project BioFresh, Leibniz-Institute of Freshwater Ecology and Inland Fisheries", + "official_name": "Freshwater Biodiversity Data Portal" + }, + "INIST.IFREMER": { + "openaire_id": "re3data_____::r3d100011867", + "datacite_name": "Institut Fran\u00e7ais de Recherche pour l'Exploitation de la Mer", + "official_name": "SEANOE" + }, + "ETHZ.SICAS": { + "openaire_id": "re3data_____::r3d100011560", + "datacite_name": "SICAS", + "official_name": "Sicas Medical Image Repository" + }, + "SND.SND": { + "openaire_id": "re3data_____::r3d100010146", + "datacite_name": "Swedish National Data Service", + "official_name": "Swedish National Data Service" + }, + "DELFT.EASY": { + "openaire_id": "re3data_____::r3d100011201", + "datacite_name": "DANS", + "official_name": "DataverseNL" + }, + "WH.WHOAS": { + "openaire_id": "re3data_____::r3d100010423", + "datacite_name": "Woods Hole Open Access Server", + "official_name": "Woods Hole Open Access Server" + }, + "DATACITE.UCSC": { + "openaire_id": "re3data_____::r3d100010243", + "datacite_name": "UCSC Genome Browser", + "official_name": "UCSC Genome Browser" + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json new file mode 100644 index 000000000..967e4445a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json @@ -0,0 +1,27 @@ +[ + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the path of the sequencial file to write", + "paramRequired": true + }, + + { + "paramName": "d", + "paramLongName": "dataciteDumpPath", + "paramDescription": "the path of the Datacite dump", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + 
"paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml new file mode 100644 index 000000000..a3caa5e23 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml @@ -0,0 +1,103 @@ + + + + mdstoreInputPath + the path of the input MDStore + + + + mdstoreOutputPath + the path of the cleaned mdstore + + + nativeInputPath + the path of the input MDStore + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn-cluster + cluster + ImportDatacite + eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + -t${nativeInputPath} + -d${mdstoreInputPath} + -n${nameNode} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + TransformJob + eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${mdstoreInputPath} + --targetPath${mdstoreOutputPath} + --isLookupUrl${isLookupUrl} + -tr${isLookupUrl} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + ExportDataset + eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${mdstoreOutputPath} + --targetPath${mdstoreOutputPath}_raw_AS + --masteryarn-cluster + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 683986de2..170dc0dc8 100644 --- 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -111,12 +111,12 @@ object DoiBoostMappingUtil { result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) } result.getInstance().asScala.foreach(i => { - i.setHostedby(getUbknownHostedBy()) + i.setHostedby(getUnknownHostedBy()) }) result } - def getUbknownHostedBy():KeyValue = { + def getUnknownHostedBy():KeyValue = { val hb = new KeyValue hb.setValue("Unknown Repository") hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c") diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index 8043236e0..3ec391313 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -224,7 +224,7 @@ object DLIToOAF { if (cleanedPids.isEmpty) return null result.setId(generateId(inputPublication.getId)) - result.setDataInfo(generateDataInfo(invisibile = true)) + result.setDataInfo(generateDataInfo(invisible = true)) if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null)) return null result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)