From 2f5caef77b7abd7acabffbc5cc3a9c9ec77b519f Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 24 Feb 2022 16:59:50 +0100 Subject: [PATCH] resolution of generated relations url to uniprot and pubmed datasources --- .../bioschema/BioschemaModelConstants.scala | 44 +--- .../BioschemaToOAFTransformation.scala | 198 +++++------------- .../eu/dnetlib/dhp/bioschema/ped_record.json | 8 +- 3 files changed, 66 insertions(+), 184 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala index d44693f5a..48009b936 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala @@ -10,17 +10,17 @@ import java.util.Locale import java.util.regex.Pattern import scala.io.Source -/** This class represent the dataModel of the input Dataset of Datacite - * @param doi THE DOI - * @param timestamp timestamp of last update date - * @param isActive the record is active or deleted - * @param json the json native records - */ -case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {} +///** This class represent the dataModel of the input Dataset of Bioschema Datacite +// * @param doi THE DOI +// * @param timestamp timestamp of last update date +// * @param isActive the record is active or deleted +// * @param json the json native records +// */ +//case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {} /* The following class are utility class used for the mapping from - json datacite to OAF Shema + bioschema json datacite to OAF Schema */ case class RelatedIdentifierType( relationType: String, @@ -68,10 +68,9 @@ object BioschemaModelConstants { val REL_TYPE_VALUE: String = "resultResult" val DATE_RELATION_KEY = "RelationDate" - val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter" val DOI_CLASS = "doi" val SUBJ_CLASS = "keywords" - val dataInfo: DataInfo = dataciteDataInfo("0.9") + val dataInfo: DataInfo = bioschemaDataInfo("0.9") val subRelTypeMapping: Map[String, OAFRelations] = Map( ModelConstants.REFERENCES -> OAFRelations( @@ -211,13 +210,7 @@ object BioschemaModelConstants { ) ) - val datacite_filter: List[String] = { - val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH) - require(stream != null) - Source.fromInputStream(stream).getLines().toList - } - - def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo( + def bioschemaDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo( false, null, false, @@ -234,23 +227,6 @@ object BioschemaModelConstants { val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) - val funder_regex: List[(Pattern, String)] = List( - ( - Pattern.compile( - "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", - Pattern.MULTILINE | Pattern.CASE_INSENSITIVE - ), - "40|corda__h2020::" - ), - ( - Pattern.compile( - "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", - Pattern.MULTILINE | Pattern.CASE_INSENSITIVE - ), - "40|corda_______::" - ) - ) - val Date_regex: List[Pattern] = List( //Y-M-D Pattern.compile( diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala index 88fd70107..ac8bf10ec 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala @@ -1,9 +1,8 @@ package eu.dnetlib.dhp.bioschema import com.fasterxml.jackson.databind.ObjectMapper -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._ -import eu.dnetlib.dhp.schema.action.AtomicAction +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} @@ -13,13 +12,8 @@ import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse -import java.text.SimpleDateFormat import java.time.LocalDate -import java.time.chrono.ThaiBuddhistDate -import java.time.format.DateTimeFormatter -import java.util.{Date, Locale} import scala.collection.JavaConverters._ -import scala.io.{Codec, Source} object BioschemaToOAFTransformation { @@ -34,9 +28,15 @@ object BioschemaToOAFTransformation { "0.9" ) + val resolvedURL: Map[String, String] = Map( + "uniprot" -> "https://www.uniprot.org/uniprot/", + "pubmed" -> "https://pubmed.ncbi.nlm.nih.gov/" + ) + val collectedFromMap: Map[String, KeyValue] = { val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue( - "10|ped_________::changeme", + //TODO create pedDatasourceId and update this value + "10|ped_________::pedDatasourceId", "PED" ) PEDCollectedFrom.setDataInfo(DATA_INFO) @@ -46,59 +46,6 @@ object BioschemaToOAFTransformation { ) } - /** This method should skip record if json contains invalid text - * defined in gile datacite_filter - * - * @param json - * @return True if the record should be skipped - */ - def skip_record(json: String): Boolean = { - datacite_filter.exists(f => json.contains(f)) - } - - @deprecated("this method will be removed", "dhp") - def toActionSet(item: Oaf): (String, String) = { - val mapper = new ObjectMapper() - - item match { - case dataset: OafDataset => - val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset] - a.setClazz(classOf[OafDataset]) - a.setPayload(dataset) - (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case publication: Publication => - val a: AtomicAction[Publication] = new AtomicAction[Publication] - a.setClazz(classOf[Publication]) - a.setPayload(publication) - (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case software: Software => - val a: AtomicAction[Software] = new AtomicAction[Software] - a.setClazz(classOf[Software]) - a.setPayload(software) - (software.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case orp: OtherResearchProduct => - val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct] - a.setClazz(classOf[OtherResearchProduct]) - a.setPayload(orp) - (orp.getClass.getCanonicalName, mapper.writeValueAsString(a)) - - case relation: Relation => - val a: AtomicAction[Relation] = new AtomicAction[Relation] - a.setClazz(classOf[Relation]) - a.setPayload(relation) - (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case _ => - null - } - - } - - def embargo_end(embargo_end_date: String): Boolean = { - val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) - val td = LocalDate.now() - td.isAfter(dt) - } - def extract_date(input: String): Option[String] = { val d = Date_regex .map(pattern => { @@ -127,16 +74,6 @@ object BioschemaToOAFTransformation { d } - def fix_thai_date(input: String, format: String): String = { - try { - val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format)) - val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth) - LocalDate.from(d).toString - } catch { - case _: Throwable => "" - } - } - def getTypeQualifier( resourceType: String, resourceTypeGeneral: String, @@ -197,7 +134,15 @@ object BioschemaToOAFTransformation { if (typeQualifiers == null) return null val i = new Instance - i.setInstancetype(typeQualifiers._1) + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) +// i.setInstancetype(typeQualifiers._1) typeQualifiers._2.getClassname match { case "dataset" => val r = new OafDataset @@ -261,20 +206,6 @@ object BioschemaToOAFTransformation { } - def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = { - val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find()) - - if (match_pattern.isDefined) { - val m = match_pattern.get._1 - val p = match_pattern.get._2 - val grantId = m.matcher(awardUri).replaceAll("$2") - val targetId = s"$p${DHPUtils.md5(grantId)}" - List(generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo)) - } else - List() - - } - def generateOAF( input: String, ts: Long, @@ -282,8 +213,6 @@ object BioschemaToOAFTransformation { vocabularies: VocabularyGroup, exportLinks: Boolean ): List[Oaf] = { - if (skip_record(input)) - return List() implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) @@ -410,73 +339,43 @@ object BioschemaToOAFTransformation { val instance = result.getInstance().get(0) - val accessRights: List[String] = for { - JObject(rightsList) <- json \\ "rightsList" - JField("rightsUri", JString(rightsUri)) <- rightsList - } yield rightsUri - - val aRights: Option[AccessRight] = accessRights - .map(r => { - vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r) - }) - .find(q => q != null) - .map(q => { - val a = new AccessRight - a.setClassid(q.getClassid) - a.setClassname(q.getClassname) - a.setSchemeid(q.getSchemeid) - a.setSchemename(q.getSchemename) - a - }) - - val access_rights_qualifier = - if (aRights.isDefined) aRights.get - else - OafMapperUtils.accessRight( - ModelConstants.UNKNOWN, - ModelConstants.NOT_AVAILABLE, - ModelConstants.DNET_ACCESS_MODES, - ModelConstants.DNET_ACCESS_MODES - ) - instance.setCollectedfrom(collectedFromMap("ped")) instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava) - instance.setAccessright(access_rights_qualifier) instance.setPid(result.getPid) - val license = accessRights - .find(r => - r.startsWith("http") && r.matches( - ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*" - ) - ) - if (license.isDefined) - instance.setLicense(OafMapperUtils.field(license.get, null)) - - val awardUris: List[String] = for { - JObject(fundingReferences) <- json \\ "fundingReferences" - JField("awardUri", JString(awardUri)) <- fundingReferences - } yield awardUri result.setId(IdentifierFactory.createIdentifier(result)) - var relations: List[Relation] = - awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) + var relations: List[Relation] = List() if (result.getId == null) return List() if (exportLinks) { val rels: List[RelatedIdentifierType] = for { - JObject(relIdentifier) <- json \\ "relatedIdentifiers" - JField("relationType", JString(relationType)) <- relIdentifier + JObject(relIdentifier) <- json \\ "relatedIdentifiers" + JField("relationType", JString(relationType)) <- relIdentifier JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier - JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier + JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType) relations = relations ::: generateRelations( rels, result.getId, - if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null + if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null, + pid ) + + val identifiers: List[RelatedIdentifierType] = for { + JObject(alternateIdentifier) <- json \\ "alternateIdentifiers" + JField("alternateIdentifier", JString(alternateIdentifierValue)) <- alternateIdentifier + } yield RelatedIdentifierType("IsIdenticalTo", alternateIdentifierValue, "URL") + + relations = relations ::: generateRelations( + identifiers, + result.getId, + if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null, + pid + ) + } if (relations != null && relations.nonEmpty) { List(result) ::: relations @@ -487,15 +386,10 @@ object BioschemaToOAFTransformation { private def generateRelations( rels: List[RelatedIdentifierType], id: String, - date: String + date: String, + pid: String ): List[Relation] = { rels - .filter(r => - subRelTypeMapping - .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") || - r.relatedIdentifierType.equalsIgnoreCase("pmid") || - r.relatedIdentifierType.equalsIgnoreCase("arxiv")) - ) .map(r => { val rel = new Relation rel.setCollectedfrom(List(collectedFromMap("ped")).asJava) @@ -510,10 +404,20 @@ object BioschemaToOAFTransformation { rel.setProperties(List(dateProps).asJava) + val foundResolvedURLId = resolvedURL.map(k => { + if (r.relatedIdentifier.contains(s"${k._1}:")) + k._1 + else + null + }).find(s => s != null); + if (foundResolvedURLId.nonEmpty) { + val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"${foundResolvedURLId.get}:") + rel.setTarget(s"${resolvedURL(foundResolvedURLId.get)}${relatedId}") + } else + rel.setTarget( + DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) + ) rel.setSource(id) - rel.setTarget( - DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) - ) rel.setCollectedfrom(List(collectedFromMap("ped")).asJava) rel.getCollectedfrom.asScala.map(c => c.getValue).toList rel diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/bioschema/ped_record.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/bioschema/ped_record.json index cc8e5a714..836acc0c6 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/bioschema/ped_record.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/bioschema/ped_record.json @@ -13,12 +13,14 @@ ], "relatedIdentifiers": [ { - "relationType": "CitedBy", - "relatedIdentifier": "https://identifiers.org/pubmed:20399186" + "relationType": "IsCitedBy", + "relatedIdentifier": "https://identifiers.org/pubmed:20399186", + "relatedIdentifierType": "URL" }, { "relationType": "IsIdenticalTo", - "relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634" + "relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634", + "relatedIdentifierType": "URL" } ], "alternateIdentifiers": [