package eu.dnetlib.dhp.bioschema

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}

import java.io.InputStream
import java.time.format.DateTimeFormatter
import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source

/** Represents the data model of the input Datacite dataset.
  *
  * @param doi the DOI of the record
  * @param timestamp timestamp of the last update date
  * @param isActive true if the record is active, false if it was deleted
  * @param json the native JSON record
  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}

/*
  The following classes are utility classes used for the mapping from
  Datacite JSON to the OAF schema.
 */
case class RelatedIdentifierType(
  relationType: String,
  relatedIdentifier: String,
  relatedIdentifierType: String
) {}

case class NameIdentifiersType(
  nameIdentifierScheme: Option[String],
  schemeUri: Option[String],
  nameIdentifier: Option[String]
) {}

case class CreatorType(
  nameType: Option[String],
  nameIdentifiers: Option[List[NameIdentifiersType]],
  name: Option[String],
  familyName: Option[String],
  givenName: Option[String],
  affiliation: Option[List[String]]
) {}

case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}

case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}

case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}

case class FundingReferenceType(
  funderIdentifierType: Option[String],
  awardTitle: Option[String],
  awardUri: Option[String],
  funderName: Option[String],
  funderIdentifier: Option[String],
  awardNumber: Option[String]
) {}

case class DateType(date: Option[String], dateType: Option[String]) {}

/** A relation term together with its inverse term and its OAF sub-relation type. */
case class OAFRelations(relation: String, inverse: String, relType: String)

class BioschemaModelConstants extends Serializable {}

object BioschemaModelConstants {

  val REL_TYPE_VALUE: String = "resultResult"
  val DATE_RELATION_KEY = "RelationDate"
  val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
  val DOI_CLASS = "doi"
  val SUBJ_CLASS = "keywords"
  val dataInfo: DataInfo = dataciteDataInfo("0.9")

  /** Mapping from a Datacite relation term to its inverse term and OAF sub-relation type.
    * FIX: the original Map literal listed DOCUMENTS and IS_DOCUMENTED_BY twice; the later
    * duplicate keys silently overrode the earlier identical entries and have been removed.
    */
  val subRelTypeMapping: Map[String, OAFRelations] = Map(
    ModelConstants.REFERENCES -> OAFRelations(
      ModelConstants.REFERENCES,
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_REFERENCED_BY -> OAFRelations(
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.REFERENCES,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.SUPPLEMENT
    ),
    ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.SUPPLEMENT
    ),
    ModelConstants.HAS_PART -> OAFRelations(
      ModelConstants.HAS_PART,
      ModelConstants.IS_PART_OF,
      ModelConstants.PART
    ),
    ModelConstants.IS_PART_OF -> OAFRelations(
      ModelConstants.IS_PART_OF,
      ModelConstants.HAS_PART,
      ModelConstants.PART
    ),
    ModelConstants.IS_VERSION_OF -> OAFRelations(
      ModelConstants.IS_VERSION_OF,
      ModelConstants.HAS_VERSION,
      ModelConstants.VERSION
    ),
    ModelConstants.HAS_VERSION -> OAFRelations(
      ModelConstants.HAS_VERSION,
      ModelConstants.IS_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_CONTINUED_BY -> OAFRelations(
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.CONTINUES,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.CONTINUES -> OAFRelations(
      ModelConstants.CONTINUES,
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.DOCUMENTS,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.DOCUMENTS -> OAFRelations(
      ModelConstants.DOCUMENTS,
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_SOURCE_OF -> OAFRelations(
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_DERIVED_FROM -> OAFRelations(
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.CITES -> OAFRelations(
      ModelConstants.CITES,
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITATION
    ),
    ModelConstants.IS_CITED_BY -> OAFRelations(
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITES,
      ModelConstants.CITATION
    ),
    // NOTE(review): IS_VARIANT_FORM_OF maps its inverse to IS_DERIVED_FROM (asymmetric
    // on purpose in the upstream Datacite mapping) — confirm this is intended here too.
    ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
      ModelConstants.IS_VARIANT_FORM_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
      ModelConstants.IS_OBSOLETED_BY,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.REVIEWS -> OAFRelations(
      ModelConstants.REVIEWS,
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEW
    ),
    ModelConstants.IS_REVIEWED_BY -> OAFRelations(
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEWS,
      ModelConstants.REVIEW
    ),
    ModelConstants.COMPILES -> OAFRelations(
      ModelConstants.COMPILES,
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_COMPILED_BY -> OAFRelations(
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.COMPILES,
      ModelConstants.RELATIONSHIP
    )
  )

  /** Lines of the datacite_filter classpath resource; records containing any of these
    * strings are skipped by the transformation.
    */
  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
    require(stream != null)
    Source.fromInputStream(stream).getLines().toList
  }

  /** Builds the DataInfo carried by mapped entities, with the given trust level. */
  def dataciteDataInfo(trust: String): DataInfo =
    OafMapperUtils.dataInfo(
      false,
      null,
      false,
      false,
      ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
      trust
    )

  // Lenient multi-pattern date formatter for English-style dates (optional sections).
  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
    "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
    Locale.ENGLISH
  )

  // Fallback formatter for Italian-style day-first dates.
  val df_it: DateTimeFormatter =
    DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  // Recognizes EC H2020 / FP7 grant-agreement identifiers and maps them to the
  // corresponding OpenAIRE project id prefix.
  val funder_regex: List[(Pattern, String)] = List(
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda__h2020::"
    ),
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda_______::"
    )
  )

  // Date-extraction patterns tried in order by extract_date.
  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile(
      "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
      Pattern.MULTILINE
    ),
    //M-D-Y
    Pattern.compile(
      "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
      Pattern.MULTILINE
    ),
    //D-M-Y
    Pattern.compile(
      "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
      Pattern.MULTILINE
    ),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )

}
package eu.dnetlib.dhp.bioschema

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse

import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}

/** Transforms Bioschema records (Datacite-shaped JSON) into OAF entities and relations. */
object BioschemaToOAFTransformation {

  val mapper = new ObjectMapper()

  val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
    false,
    null,
    false,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
    "0.9"
  )

  // Datasource(s) the mapped entities are collected from, keyed by short name.
  val collectedFromMap: Map[String, KeyValue] = {
    val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
      "10|ped_________::changeme",
      "PED"
    )
    PEDCollectedFrom.setDataInfo(DATA_INFO)

    Map(
      "ped" -> PEDCollectedFrom
    )
  }

  /** This method should skip the record if the JSON contains invalid text
    * defined in the file datacite_filter.
    *
    * @param json the native JSON record
    * @return true if the record should be skipped
    */
  def skip_record(json: String): Boolean = {
    datacite_filter.exists(f => json.contains(f))
  }

  /** Serializes an Oaf entity/relation into an action-set (class name, AtomicAction JSON) pair. */
  @deprecated("this method will be removed", "dhp")
  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()

    item match {
      case dataset: OafDataset =>
        val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
        a.setClazz(classOf[OafDataset])
        a.setPayload(dataset)
        (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case publication: Publication =>
        val a: AtomicAction[Publication] = new AtomicAction[Publication]
        a.setClazz(classOf[Publication])
        a.setPayload(publication)
        (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case software: Software =>
        val a: AtomicAction[Software] = new AtomicAction[Software]
        a.setClazz(classOf[Software])
        a.setPayload(software)
        (software.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case orp: OtherResearchProduct =>
        val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
        a.setClazz(classOf[OtherResearchProduct])
        a.setPayload(orp)
        (orp.getClass.getCanonicalName, mapper.writeValueAsString(a))

      case relation: Relation =>
        val a: AtomicAction[Relation] = new AtomicAction[Relation]
        a.setClazz(classOf[Relation])
        a.setPayload(relation)
        (relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case _ =>
        null
    }

  }

  /** @return true if the given embargo end date (yyyy-MM-dd) is in the past. */
  def embargo_end(embargo_end_date: String): Boolean = {
    val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
    val td = LocalDate.now()
    td.isAfter(dt)
  }

  /** Extracts the first recognizable date from free text using Date_regex, then tries to
    * normalize it with the English and Italian formatters. A bare year becomes 01-01-yyyy.
    *
    * @return the ISO local date string, or None when nothing parseable is found
    */
  def extract_date(input: String): Option[String] = {
    val d = Date_regex
      .map(pattern => {
        val matcher = pattern.matcher(input)
        if (matcher.find())
          matcher.group(0)
        else
          null
      })
      .find(s => s != null)

    if (d.isDefined) {
      val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
      try {
        return Some(LocalDate.parse(a_date, df_en).toString)
      } catch {
        case _: Throwable =>
          try {
            return Some(LocalDate.parse(a_date, df_it).toString)
          } catch {
            case _: Throwable =>
              return None
          }
      }
    }
    d
  }

  /** Converts a date expressed in the Thai Buddhist calendar to an ISO local date,
    * or returns the empty string when parsing fails.
    */
  def fix_thai_date(input: String, format: String): String = {
    try {
      val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
      val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
      LocalDate.from(d).toString
    } catch {
      case _: Throwable => ""
    }
  }

  /** Resolves (instance-type qualifier, result-typology qualifier) from the record types,
    * trying resourceType, then schemaOrg, then resourceTypeGeneral against the vocabularies.
    *
    * @return the qualifier pair, or null when no synonym is found
    */
  def getTypeQualifier(
    resourceType: String,
    resourceTypeGeneral: String,
    schemaOrg: String,
    vocabularies: VocabularyGroup
  ): (Qualifier, Qualifier) = {
    if (resourceType != null && resourceType.nonEmpty) {
      val typeQualifier =
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
      if (typeQualifier != null)
        return (
          typeQualifier,
          vocabularies.getSynonymAsQualifier(
            ModelConstants.DNET_RESULT_TYPOLOGIES,
            typeQualifier.getClassid
          )
        )
    }
    if (schemaOrg != null && schemaOrg.nonEmpty) {
      val typeQualifier =
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
      if (typeQualifier != null)
        return (
          typeQualifier,
          vocabularies.getSynonymAsQualifier(
            ModelConstants.DNET_RESULT_TYPOLOGIES,
            typeQualifier.getClassid
          )
        )

    }
    if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
      val typeQualifier = vocabularies.getSynonymAsQualifier(
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        resourceTypeGeneral
      )
      if (typeQualifier != null)
        return (
          typeQualifier,
          vocabularies.getSynonymAsQualifier(
            ModelConstants.DNET_RESULT_TYPOLOGIES,
            typeQualifier.getClassid
          )
        )

    }
    null
  }

  /** Instantiates the concrete Result subclass matching the resolved typology and attaches
    * a single Instance carrying the instance-type qualifier.
    *
    * @return the Result, or null when the type cannot be resolved
    */
  def getResult(
    resourceType: String,
    resourceTypeGeneral: String,
    schemaOrg: String,
    vocabularies: VocabularyGroup
  ): Result = {
    val typeQualifiers: (Qualifier, Qualifier) =
      getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
    if (typeQualifiers == null)
      return null
    val i = new Instance
    i.setInstancetype(typeQualifiers._1)
    typeQualifiers._2.getClassname match {
      case "dataset" =>
        val r = new OafDataset
        r.setInstance(List(i).asJava)
        r
      case "publication" =>
        val r = new Publication
        r.setInstance(List(i).asJava)
        r
      case "software" =>
        val r = new Software
        r.setInstance(List(i).asJava)
        r
      case "other" =>
        val r = new OtherResearchProduct
        r.setInstance(List(i).asJava)
        r
      // FIX: the original match had no default case, so an unexpected typology classname
      // threw a MatchError; the trailing `null` after the match was unreachable. Callers
      // already handle a null Result, so return null instead.
      case _ =>
        null
    }
  }

  /** @return true when the record declares a date with dateType "available". */
  def available_date(input: String): Boolean = {

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val l: List[String] = for {
      JObject(dates) <- json \\ "dates"
      JField("dateType", JString(dateTypes)) <- dates
    } yield dateTypes

    l.exists(p => p.equalsIgnoreCase("available"))

  }

  /** Builds a D-Net identifier of the form prefix|pidtype_::md5(pid). */
  def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
    val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
    s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
  }

  def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
    OafMapperUtils.structuredProperty(dt, q, null)
  }

  /** Builds a result-project outcome relation between a result and a project. */
  def generateRelation(
    sourceId: String,
    targetId: String,
    relClass: String,
    cf: KeyValue,
    di: DataInfo
  ): Relation = {

    val r = new Relation
    r.setSource(sourceId)
    r.setTarget(targetId)
    r.setRelType(ModelConstants.RESULT_PROJECT)
    r.setRelClass(relClass)
    r.setSubRelType(ModelConstants.OUTCOME)
    r.setCollectedfrom(List(cf).asJava)
    r.setDataInfo(di)
    r

  }

  /** Derives an isProducedBy relation from an EC grant-agreement award URI, if it matches
    * one of the funder patterns; otherwise returns an empty list.
    */
  def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
    val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())

    if (match_pattern.isDefined) {
      val m = match_pattern.get._1
      val p = match_pattern.get._2
      val grantId = m.matcher(awardUri).replaceAll("$2")
      val targetId = s"$p${DHPUtils.md5(grantId)}"
      List(generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo))
    } else
      List()

  }

  /** Maps one native JSON record into a list of OAF objects (the Result plus, optionally,
    * its relations).
    *
    * @param input the native JSON record
    * @param ts timestamp of the record (currently unused in the mapping body)
    * @param dateOfCollection collection timestamp (currently unused in the mapping body)
    * @param vocabularies vocabulary group used to resolve types, dates, languages, rights
    * @param exportLinks when true, relatedIdentifiers are mapped to Relation objects
    * @return an empty list when the record is filtered out or its type cannot be resolved
    */
  def generateOAF(
    input: String,
    ts: Long,
    dateOfCollection: Long,
    vocabularies: VocabularyGroup,
    exportLinks: Boolean
  ): List[Oaf] = {
    if (skip_record(input))
      return List()

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    val resourceType = (json \ "types" \ "resourceType").extractOrElse[String](null)
    val resourceTypeGeneral =
      (json \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
    val schemaOrg = (json \ "types" \ "schemaOrg").extractOrElse[String](null)

    //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
    val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
    if (result == null)
      return List()

    val pid = (json \ "id").extract[String]

    result.setPid(
      List(
        OafMapperUtils.structuredProperty(
          pid,
          "ped",
          "ped",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )
    // Provisional id; overwritten below by IdentifierFactory.createIdentifier(result).
    result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
    result.setOriginalId(List(pid).asJava)

    result.setDataInfo(dataInfo)

    val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())

    val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())

    result.setTitle(
      titles
        .filter(t => t.title.nonEmpty)
        .map(t => {
          if (t.titleType.isEmpty) {
            OafMapperUtils
              .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
          } else {
            OafMapperUtils.structuredProperty(
              t.title.get,
              t.titleType.get,
              t.titleType.get,
              ModelConstants.DNET_DATACITE_TITLE,
              ModelConstants.DNET_DATACITE_TITLE,
              null
            )
          }
        })
        .asJava
    )

    val dates = (json \\ "dates").extract[List[DateType]]
    val publication_year = (json \\ "publicationYear").extractOrElse[String](null)

    // "issued" date, if any, normalized via extract_date.
    val i_date = dates
      .filter(d => d.date.isDefined && d.dateType.isDefined)
      .find(d => d.dateType.get.equalsIgnoreCase("issued"))
      .map(d => extract_date(d.date.get))
    // NOTE(review): the "available" date is mapped to embargoenddate below — confirm intended.
    val a_date: Option[String] = dates
      .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
      .map(d => extract_date(d.date.get))
      .find(d => d != null && d.isDefined)
      .map(d => d.get)

    if (a_date.isDefined) {
      result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
    }
    if (i_date.isDefined && i_date.get.isDefined) {
      result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
      result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
    } else if (publication_year != null) {
      result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
      result
        .getInstance()
        .get(0)
        .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
    }

    result.setRelevantdate(
      dates
        .filter(d => d.date.isDefined && d.dateType.isDefined)
        .map(d => (extract_date(d.date.get), d.dateType.get))
        .filter(d => d._1.isDefined)
        .map(d =>
          (
            d._1.get,
            vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
          )
        )
        .filter(d => d._2 != null)
        .map(d => generateOAFDate(d._1, d._2))
        .asJava
    )

    result.setCollectedfrom(List(collectedFromMap("ped")).asJava)

    val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]

    result.setDescription(
      descriptions
        .filter(d => d.description.isDefined)
        .map(d => OafMapperUtils.field(d.description.get, null))
        .filter(s => s != null)
        .asJava
    )

    val publisher = (json \\ "publisher").extractOrElse[String](null)
    if (publisher != null)
      result.setPublisher(OafMapperUtils.field(publisher, null))

    val language: String = (json \\ "language").extractOrElse[String](null)

    if (language != null)
      result.setLanguage(
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
      )

    val instance = result.getInstance().get(0)

    val accessRights: List[String] = for {
      JObject(rightsList) <- json \\ "rightsList"
      JField("rightsUri", JString(rightsUri)) <- rightsList
    } yield rightsUri

    val aRights: Option[AccessRight] = accessRights
      .map(r => {
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
      })
      .find(q => q != null)
      .map(q => {
        val a = new AccessRight
        a.setClassid(q.getClassid)
        a.setClassname(q.getClassname)
        a.setSchemeid(q.getSchemeid)
        a.setSchemename(q.getSchemename)
        a
      })

    val access_rights_qualifier =
      if (aRights.isDefined) aRights.get
      else
        OafMapperUtils.accessRight(
          ModelConstants.UNKNOWN,
          ModelConstants.NOT_AVAILABLE,
          ModelConstants.DNET_ACCESS_MODES,
          ModelConstants.DNET_ACCESS_MODES
        )

    instance.setCollectedfrom(collectedFromMap("ped"))
    instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
    instance.setAccessright(access_rights_qualifier)
    instance.setPid(result.getPid)
    val license = accessRights
      .find(r =>
        r.startsWith("http") && r.matches(
          ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
        )
      )
    if (license.isDefined)
      instance.setLicense(OafMapperUtils.field(license.get, null))

    val awardUris: List[String] = for {
      JObject(fundingReferences) <- json \\ "fundingReferences"
      JField("awardUri", JString(awardUri)) <- fundingReferences
    } yield awardUri

    result.setId(IdentifierFactory.createIdentifier(result))
    var relations: List[Relation] =
      awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)

    if (result.getId == null)
      return List()

    if (exportLinks) {
      val rels: List[RelatedIdentifierType] = for {
        JObject(relIdentifier) <- json \\ "relatedIdentifiers"
        JField("relationType", JString(relationType)) <- relIdentifier
        JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
        JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
      } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)

      relations = relations ::: generateRelations(
        rels,
        result.getId,
        if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
      )
    }
    if (relations != null && relations.nonEmpty) {
      List(result) ::: relations
    } else
      List(result)
  }

  /** Maps relatedIdentifiers (doi/pmid/arxiv only, with a known relation type) to Relations
    * targeting unresolved identifiers.
    */
  private def generateRelations(
    rels: List[RelatedIdentifierType],
    id: String,
    date: String
  ): List[Relation] = {
    rels
      .filter(r =>
        subRelTypeMapping
          .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
          r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
          r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
      )
      .map(r => {
        val rel = new Relation
        rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
        rel.setDataInfo(dataInfo)

        val subRelType = subRelTypeMapping(r.relationType).relType
        rel.setRelType(REL_TYPE_VALUE)
        rel.setSubRelType(subRelType)
        rel.setRelClass(r.relationType)

        val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)

        rel.setProperties(List(dateProps).asJava)

        rel.setSource(id)
        rel.setTarget(
          DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
        )
        // FIX: removed a redundant second setCollectedfrom(...) call (identical to the one
        // above) and a discarded no-op expression that mapped collectedfrom values to a List.
        rel
      })
  }

  /** Builds a datasource id of the form 10|prefix::md5(suffix) from a prefix::suffix input. */
  def generateDSId(input: String): String = {
    val b = StringUtils.substringBefore(input, "::")
    val a = StringUtils.substringAfter(input, "::")
    s"10|$b::${DHPUtils.md5(a)}"
  }

}
"resourceType": "Protein", + "resourceTypeGeneral": "Dataset" + }, + "creators": [], + "identifiers": [ + { + "identifier": "https://proteinensemble.org/PED00001#P38634_A_1", + "identifierType": "URL" + } + ], + "relatedIdentifiers": [ + { + "relationType": "CitedBy", + "relatedIdentifier": "https://identifiers.org/pubmed:20399186" + }, + { + "relationType": "IsIdenticalTo", + "relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634" + } + ], + "alternateIdentifiers": [ + { + "alternateIdentifier": "https://identifiers.org/uniprot:P38634" + } + ], + "descriptions": [], + "titles": [ + { + "title": "Protein SIC1" + } + ], + "dates": [ + { + "date": "2021-12-09T21:10:30", + "dateType": "Collected" + } + ] +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala new file mode 100644 index 000000000..f35749fa4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala @@ -0,0 +1,108 @@ +package eu.dnetlib.dhp.bioschema + +import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest +//import eu.dnetlib.dhp.bioschema.{BioschemaToOAFTransformation, GenerateDataciteDatasetSpark} +import eu.dnetlib.dhp.bioschema.BioschemaToOAFTransformation +import eu.dnetlib.dhp.schema.oaf.Oaf +import org.apache.commons.io.FileUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql.functions.{col, count} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.junit.jupiter.api.Assertions._ +import org.junit.jupiter.api.extension.ExtendWith +import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} +import org.mockito.junit.jupiter.MockitoExtension +import org.slf4j.{Logger, LoggerFactory} + +import 
java.nio.file.{Files, Path} +import java.text.SimpleDateFormat +import java.util.Locale +import scala.io.Source + +@ExtendWith(Array(classOf[MockitoExtension])) +class BioschemaDataciteToOAFTest extends AbstractVocabularyTest { + + private var workingDir: Path = null + val log: Logger = LoggerFactory.getLogger(getClass) + + @BeforeEach + def setUp(): Unit = { + + workingDir = Files.createTempDirectory(getClass.getSimpleName) + super.setUpVocabulary() + } + + @AfterEach + def tearDown(): Unit = { + FileUtils.deleteDirectory(workingDir.toFile) + } + + @Test + def testDateMapping: Unit = { + val inputDate = "2021-07-14T11:52:54+0000" + val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) + val dt = ISO8601FORMAT.parse(inputDate) + println(dt.getTime) + + } + +// @Test +// def testConvert(): Unit = { +// +// val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath +// +// val conf = new SparkConf() +// val spark: SparkSession = SparkSession +// .builder() +// .config(conf) +// .appName(getClass.getSimpleName) +// .master("local[*]") +// .getOrCreate() +// +// implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] +// val instance = new GenerateDataciteDatasetSpark(null, null, log) +// val targetPath = s"$workingDir/result" +// +// instance.generateDataciteDataset(path, exportLinks = true, vocabularies, targetPath, spark) +// +// import spark.implicits._ +// +// val nativeSize = spark.read.load(path).count() +// +// assertEquals(100, nativeSize) +// +// val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf] +// +// result +// .map(s => s.getClass.getSimpleName) +// .groupBy(col("value").alias("class")) +// .agg(count("value").alias("Total")) +// .show(false) +// +// val t = spark.read.load(targetPath).count() +// +// assertTrue(t > 0) +// +// spark.stop() +// +// } + + @Test + def testMapping(): Unit = { + val record = Source + 
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/bioschema/ped_record.json")) + .mkString + + val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) + val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true) + + res.foreach(r => { + println(mapper.writeValueAsString(r)) + println("----------------------------") + + }) + + } + +} diff --git a/dhp-workflows/dhp-bmuse/sitemap.txt b/dhp-workflows/dhp-bmuse/sitemap.txt new file mode 100644 index 000000000..d8ed5ebe1 --- /dev/null +++ b/dhp-workflows/dhp-bmuse/sitemap.txt @@ -0,0 +1,62 @@ +https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612 + +PED + + workingPath + /data/bioschema/ped/ + the working path + + + sitemapUrl + https://proteinensemble.org/sitemap2.xml.gz + + + sitemapURLKey + loc + + + dynamic + true + the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) + + +DISPROT + + workingPath + /data/bioschema/disprot/ + the working path + + + sitemapUrl + https://disprot.org/sitemap2.xml.gz + + + sitemapURLKey + loc + + + dynamic + true + the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) + + +MOBIDB + + workingPath + /data/bioschema/mobidb/ + the working path + + + sitemapUrl + https://mobidb.org/sitemap2.xml.gz + + + sitemapURLKey + loc + + + dynamic + true + the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) + +