Resolve the URLs of generated relations to the UniProt and PubMed datasources

This commit is contained in:
Enrico Ottonello 2022-02-24 16:59:50 +01:00
parent 4975278558
commit 2f5caef77b
3 changed files with 66 additions and 184 deletions

View File

@ -10,17 +10,17 @@ import java.util.Locale
import java.util.regex.Pattern import java.util.regex.Pattern
import scala.io.Source import scala.io.Source
/** This class represent the dataModel of the input Dataset of Datacite ///** This class represent the dataModel of the input Dataset of Bioschema Datacite
* @param doi THE DOI // * @param doi THE DOI
* @param timestamp timestamp of last update date // * @param timestamp timestamp of last update date
* @param isActive the record is active or deleted // * @param isActive the record is active or deleted
* @param json the json native records // * @param json the json native records
*/ // */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {} //case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/* /*
The following class are utility class used for the mapping from The following class are utility class used for the mapping from
json datacite to OAF Shema bioschema json datacite to OAF Schema
*/ */
case class RelatedIdentifierType( case class RelatedIdentifierType(
relationType: String, relationType: String,
@ -68,10 +68,9 @@ object BioschemaModelConstants {
val REL_TYPE_VALUE: String = "resultResult" val REL_TYPE_VALUE: String = "resultResult"
val DATE_RELATION_KEY = "RelationDate" val DATE_RELATION_KEY = "RelationDate"
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi" val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords" val SUBJ_CLASS = "keywords"
val dataInfo: DataInfo = dataciteDataInfo("0.9") val dataInfo: DataInfo = bioschemaDataInfo("0.9")
val subRelTypeMapping: Map[String, OAFRelations] = Map( val subRelTypeMapping: Map[String, OAFRelations] = Map(
ModelConstants.REFERENCES -> OAFRelations( ModelConstants.REFERENCES -> OAFRelations(
@ -211,13 +210,7 @@ object BioschemaModelConstants {
) )
) )
val datacite_filter: List[String] = { def bioschemaDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
require(stream != null)
Source.fromInputStream(stream).getLines().toList
}
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
false, false,
null, null,
false, false,
@ -234,23 +227,6 @@ object BioschemaModelConstants {
val df_it: DateTimeFormatter = val df_it: DateTimeFormatter =
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(
Pattern.compile(
"(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
),
"40|corda__h2020::"
),
(
Pattern.compile(
"(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
),
"40|corda_______::"
)
)
val Date_regex: List[Pattern] = List( val Date_regex: List[Pattern] = List(
//Y-M-D //Y-M-D
Pattern.compile( Pattern.compile(

View File

@ -1,9 +1,8 @@
package eu.dnetlib.dhp.bioschema package eu.dnetlib.dhp.bioschema
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._ import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
@ -13,13 +12,8 @@ import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import java.text.SimpleDateFormat
import java.time.LocalDate import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
object BioschemaToOAFTransformation { object BioschemaToOAFTransformation {
@ -34,9 +28,15 @@ object BioschemaToOAFTransformation {
"0.9" "0.9"
) )
val resolvedURL: Map[String, String] = Map(
"uniprot" -> "https://www.uniprot.org/uniprot/",
"pubmed" -> "https://pubmed.ncbi.nlm.nih.gov/"
)
val collectedFromMap: Map[String, KeyValue] = { val collectedFromMap: Map[String, KeyValue] = {
val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue( val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|ped_________::changeme", //TODO create pedDatasourceId and update this value
"10|ped_________::pedDatasourceId",
"PED" "PED"
) )
PEDCollectedFrom.setDataInfo(DATA_INFO) PEDCollectedFrom.setDataInfo(DATA_INFO)
@ -46,59 +46,6 @@ object BioschemaToOAFTransformation {
) )
} }
/** This method should skip record if json contains invalid text
* defined in file datacite_filter
*
* @param json
* @return True if the record should be skipped
*/
def skip_record(json: String): Boolean = {
datacite_filter.exists(f => json.contains(f))
}
@deprecated("this method will be removed", "dhp")
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: OafDataset =>
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
a.setClazz(classOf[OafDataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case software: Software =>
val a: AtomicAction[Software] = new AtomicAction[Software]
a.setClazz(classOf[Software])
a.setPayload(software)
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
case orp: OtherResearchProduct =>
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
a.setClazz(classOf[OtherResearchProduct])
a.setPayload(orp)
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
td.isAfter(dt)
}
def extract_date(input: String): Option[String] = { def extract_date(input: String): Option[String] = {
val d = Date_regex val d = Date_regex
.map(pattern => { .map(pattern => {
@ -127,16 +74,6 @@ object BioschemaToOAFTransformation {
d d
} }
def fix_thai_date(input: String, format: String): String = {
try {
val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}
def getTypeQualifier( def getTypeQualifier(
resourceType: String, resourceType: String,
resourceTypeGeneral: String, resourceTypeGeneral: String,
@ -197,7 +134,15 @@ object BioschemaToOAFTransformation {
if (typeQualifiers == null) if (typeQualifiers == null)
return null return null
val i = new Instance val i = new Instance
i.setInstancetype(typeQualifiers._1) i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
// i.setInstancetype(typeQualifiers._1)
typeQualifiers._2.getClassname match { typeQualifiers._2.getClassname match {
case "dataset" => case "dataset" =>
val r = new OafDataset val r = new OafDataset
@ -261,20 +206,6 @@ object BioschemaToOAFTransformation {
} }
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
if (match_pattern.isDefined) {
val m = match_pattern.get._1
val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo))
} else
List()
}
def generateOAF( def generateOAF(
input: String, input: String,
ts: Long, ts: Long,
@ -282,8 +213,6 @@ object BioschemaToOAFTransformation {
vocabularies: VocabularyGroup, vocabularies: VocabularyGroup,
exportLinks: Boolean exportLinks: Boolean
): List[Oaf] = { ): List[Oaf] = {
if (skip_record(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input) lazy val json = parse(input)
@ -410,73 +339,43 @@ object BioschemaToOAFTransformation {
val instance = result.getInstance().get(0) val instance = result.getInstance().get(0)
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[AccessRight] = accessRights
.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
})
.find(q => q != null)
.map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val access_rights_qualifier =
if (aRights.isDefined) aRights.get
else
OafMapperUtils.accessRight(
ModelConstants.UNKNOWN,
ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
instance.setCollectedfrom(collectedFromMap("ped")) instance.setCollectedfrom(collectedFromMap("ped"))
instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava) instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid) instance.setPid(result.getPid)
val license = accessRights
.find(r =>
r.startsWith("http") && r.matches(
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
)
)
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
result.setId(IdentifierFactory.createIdentifier(result)) result.setId(IdentifierFactory.createIdentifier(result))
var relations: List[Relation] = var relations: List[Relation] = List()
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
if (result.getId == null) if (result.getId == null)
return List() return List()
if (exportLinks) { if (exportLinks) {
val rels: List[RelatedIdentifierType] = for { val rels: List[RelatedIdentifierType] = for {
JObject(relIdentifier) <- json \\ "relatedIdentifiers" JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType) } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations( relations = relations ::: generateRelations(
rels, rels,
result.getId, result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
pid
) )
val identifiers: List[RelatedIdentifierType] = for {
JObject(alternateIdentifier) <- json \\ "alternateIdentifiers"
JField("alternateIdentifier", JString(alternateIdentifierValue)) <- alternateIdentifier
} yield RelatedIdentifierType("IsIdenticalTo", alternateIdentifierValue, "URL")
relations = relations ::: generateRelations(
identifiers,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
pid
)
} }
if (relations != null && relations.nonEmpty) { if (relations != null && relations.nonEmpty) {
List(result) ::: relations List(result) ::: relations
@ -487,15 +386,10 @@ object BioschemaToOAFTransformation {
private def generateRelations( private def generateRelations(
rels: List[RelatedIdentifierType], rels: List[RelatedIdentifierType],
id: String, id: String,
date: String date: String,
pid: String
): List[Relation] = { ): List[Relation] = {
rels rels
.filter(r =>
subRelTypeMapping
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
.map(r => { .map(r => {
val rel = new Relation val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava) rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
@ -510,10 +404,20 @@ object BioschemaToOAFTransformation {
rel.setProperties(List(dateProps).asJava) rel.setProperties(List(dateProps).asJava)
val foundResolvedURLId = resolvedURL.map(k => {
if (r.relatedIdentifier.contains(s"${k._1}:"))
k._1
else
null
}).find(s => s != null);
if (foundResolvedURLId.nonEmpty) {
val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"${foundResolvedURLId.get}:")
rel.setTarget(s"${resolvedURL(foundResolvedURLId.get)}${relatedId}")
} else
rel.setTarget(
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
)
rel.setSource(id) rel.setSource(id)
rel.setTarget(
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
)
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava) rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel rel

View File

@ -13,12 +13,14 @@
], ],
"relatedIdentifiers": [ "relatedIdentifiers": [
{ {
"relationType": "CitedBy", "relationType": "IsCitedBy",
"relatedIdentifier": "https://identifiers.org/pubmed:20399186" "relatedIdentifier": "https://identifiers.org/pubmed:20399186",
"relatedIdentifierType": "URL"
}, },
{ {
"relationType": "IsIdenticalTo", "relationType": "IsIdenticalTo",
"relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634" "relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634",
"relatedIdentifierType": "URL"
} }
], ],
"alternateIdentifiers": [ "alternateIdentifiers": [