resolution of generated relations url to uniprot and pubmed datasources

This commit is contained in:
Enrico Ottonello 2022-02-24 16:59:50 +01:00
parent 4975278558
commit 2f5caef77b
3 changed files with 66 additions and 184 deletions

View File

@ -10,17 +10,17 @@ import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
/** Data model of a single record of the Datacite input dataset.
  *
  * @param doi       the DOI of the record
  * @param timestamp timestamp of the last update
  * @param isActive  whether the record is active (true) or deleted (false)
  * @param json      the native JSON record
  */
case class DataciteType(
  doi: String,
  timestamp: Long,
  isActive: Boolean,
  json: String
)
///** This class represent the dataModel of the input Dataset of Bioschema Datacite
// * @param doi THE DOI
// * @param timestamp timestamp of last update date
// * @param isActive the record is active or deleted
// * @param json the json native records
// */
//case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/*
The following classes are utility classes used for the mapping from
json datacite to OAF Shema
bioschema json datacite to OAF Schema
*/
case class RelatedIdentifierType(
relationType: String,
@ -68,10 +68,9 @@ object BioschemaModelConstants {
val REL_TYPE_VALUE: String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val dataInfo: DataInfo = dataciteDataInfo("0.9")
val dataInfo: DataInfo = bioschemaDataInfo("0.9")
val subRelTypeMapping: Map[String, OAFRelations] = Map(
ModelConstants.REFERENCES -> OAFRelations(
@ -211,13 +210,7 @@ object BioschemaModelConstants {
)
)
/** Lines of the classpath resource located at DATACITE_FILTER_PATH, read eagerly into a list.
  *
  * Fails with an IllegalArgumentException (raised by `require`) when the resource is missing.
  */
val datacite_filter: List[String] = {
  val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
  require(stream != null, s"classpath resource not found: $DATACITE_FILTER_PATH")
  val source = Source.fromInputStream(stream)
  // getLines() is lazy: materialise the list first, then always release the underlying stream.
  try source.getLines().toList
  finally source.close()
}
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
def bioschemaDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
@ -234,23 +227,6 @@ object BioschemaModelConstants {
// Day-first formatter accepting either dd-MM-yyyy or dd/MM/yyyy, built with the Italian locale.
val df_it: DateTimeFormatter = {
  val dayFirstPattern = "[dd-MM-yyyy][dd/MM/yyyy]"
  DateTimeFormatter.ofPattern(dayFirstPattern, Locale.ITALIAN)
}
// Pairs of (grant-agreement URI pattern, identifier prefix): one entry for H2020 and one for FP7.
// In both patterns the second capture group is the six-digit grant number.
val funder_regex: List[(Pattern, String)] = {
  val flags = Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
  List(
    "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)" -> "40|corda__h2020::",
    "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)" -> "40|corda_______::"
  ).map { case (regex, prefix) => (Pattern.compile(regex, flags), prefix) }
}
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile(

View File

@ -1,9 +1,8 @@
package eu.dnetlib.dhp.bioschema
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
@ -13,13 +12,8 @@ import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
object BioschemaToOAFTransformation {
@ -34,9 +28,15 @@ object BioschemaToOAFTransformation {
"0.9"
)
// Base URLs keyed by identifier prefix ("uniprot", "pubmed").
val resolvedURL: Map[String, String] = Map(
  ("uniprot", "https://www.uniprot.org/uniprot/"),
  ("pubmed", "https://pubmed.ncbi.nlm.nih.gov/")
)
val collectedFromMap: Map[String, KeyValue] = {
val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|ped_________::changeme",
//TODO create pedDatasourceId and update this value
"10|ped_________::pedDatasourceId",
"PED"
)
PEDCollectedFrom.setDataInfo(DATA_INFO)
@ -46,59 +46,6 @@ object BioschemaToOAFTransformation {
)
}
/** Decides whether a record must be skipped because its JSON contains
  * one of the invalid text fragments listed in datacite_filter.
  *
  * @param json the raw JSON of the record
  * @return true if the record should be skipped
  */
def skip_record(json: String): Boolean =
  datacite_filter.exists(json.contains(_))
/** Wraps a supported OAF entity into an AtomicAction and serialises it to JSON.
  *
  * @param item the entity to wrap (Dataset, Publication, Software, OtherResearchProduct or Relation)
  * @return the pair (canonical name of the entity's runtime class, JSON of the atomic action),
  *         or null when the entity is of any other type
  */
@deprecated("this method will be removed", "dhp")
def toActionSet(item: Oaf): (String, String) = {
  val mapper = new ObjectMapper()

  // Builds the atomic action for one payload and pairs its JSON with the payload's class name.
  def serialise[T <: Oaf](payload: T, clazz: Class[T]): (String, String) = {
    val action: AtomicAction[T] = new AtomicAction[T]
    action.setClazz(clazz)
    action.setPayload(payload)
    (payload.getClass.getCanonicalName, mapper.writeValueAsString(action))
  }

  item match {
    case dataset: OafDataset       => serialise(dataset, classOf[OafDataset])
    case publication: Publication  => serialise(publication, classOf[Publication])
    case software: Software        => serialise(software, classOf[Software])
    case orp: OtherResearchProduct => serialise(orp, classOf[OtherResearchProduct])
    case relation: Relation        => serialise(relation, classOf[Relation])
    case _                         => null
  }
}
/** Tells whether an embargo has already ended.
  *
  * @param embargo_end_date the embargo end date, parsed with the pattern "[yyyy-MM-dd]"
  * @return true when the current date is strictly after the embargo end date
  */
def embargo_end(embargo_end_date: String): Boolean = {
  val dayFormat = DateTimeFormatter.ofPattern("[yyyy-MM-dd]")
  val embargoEnd = LocalDate.parse(embargo_end_date, dayFormat)
  LocalDate.now().isAfter(embargoEnd)
}
def extract_date(input: String): Option[String] = {
val d = Date_regex
.map(pattern => {
@ -127,16 +74,6 @@ object BioschemaToOAFTransformation {
d
}
/** Re-interprets a date whose year is expressed in the Thai Buddhist calendar
  * and returns the corresponding ISO date.
  *
  * @param input  the date string to convert
  * @param format the DateTimeFormatter pattern used to parse `input`
  * @return the converted date rendered by `LocalDate.toString`, or the empty string
  *         when the input cannot be parsed or converted
  */
def fix_thai_date(input: String, format: String): String = {
  import scala.util.control.NonFatal
  try {
    val parsed = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
    // The parsed year/month/day are taken as Thai Buddhist calendar fields.
    val thai = ThaiBuddhistDate.of(parsed.getYear, parsed.getMonth.getValue, parsed.getDayOfMonth)
    LocalDate.from(thai).toString
  } catch {
    // Best effort for ordinary failures only; fatal errors (OOM, interrupts, ...) must propagate.
    case NonFatal(_) => ""
  }
}
def getTypeQualifier(
resourceType: String,
resourceTypeGeneral: String,
@ -197,7 +134,15 @@ object BioschemaToOAFTransformation {
if (typeQualifiers == null)
return null
val i = new Instance
i.setInstancetype(typeQualifiers._1)
i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
// i.setInstancetype(typeQualifiers._1)
typeQualifiers._2.getClassname match {
case "dataset" =>
val r = new OafDataset
@ -261,20 +206,6 @@ object BioschemaToOAFTransformation {
}
/** Builds the "isProducedBy" relation towards the project referenced by an award URI.
  *
  * @param awardUri the award URI, matched against the patterns in funder_regex
  * @param sourceId the identifier of the relation source
  * @return a single-element list with the relation when a pattern matches, an empty list otherwise
  */
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] =
  funder_regex
    .find { case (pattern, _) => pattern.matcher(awardUri).find() }
    .map { case (pattern, prefix) =>
      // Keep only the second capture group (the grant number) of the matched URI.
      val grantId = pattern.matcher(awardUri).replaceAll("$2")
      val targetId = s"$prefix${DHPUtils.md5(grantId)}"
      generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo)
    }
    .toList
def generateOAF(
input: String,
ts: Long,
@ -282,8 +213,6 @@ object BioschemaToOAFTransformation {
vocabularies: VocabularyGroup,
exportLinks: Boolean
): List[Oaf] = {
if (skip_record(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@ -410,56 +339,12 @@ object BioschemaToOAFTransformation {
val instance = result.getInstance().get(0)
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[AccessRight] = accessRights
.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
})
.find(q => q != null)
.map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val access_rights_qualifier =
if (aRights.isDefined) aRights.get
else
OafMapperUtils.accessRight(
ModelConstants.UNKNOWN,
ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
instance.setCollectedfrom(collectedFromMap("ped"))
instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
val license = accessRights
.find(r =>
r.startsWith("http") && r.matches(
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
)
)
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
result.setId(IdentifierFactory.createIdentifier(result))
var relations: List[Relation] =
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
var relations: List[Relation] = List()
if (result.getId == null)
return List()
@ -475,8 +360,22 @@ object BioschemaToOAFTransformation {
relations = relations ::: generateRelations(
rels,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
pid
)
val identifiers: List[RelatedIdentifierType] = for {
JObject(alternateIdentifier) <- json \\ "alternateIdentifiers"
JField("alternateIdentifier", JString(alternateIdentifierValue)) <- alternateIdentifier
} yield RelatedIdentifierType("IsIdenticalTo", alternateIdentifierValue, "URL")
relations = relations ::: generateRelations(
identifiers,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
pid
)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
@ -487,15 +386,10 @@ object BioschemaToOAFTransformation {
private def generateRelations(
rels: List[RelatedIdentifierType],
id: String,
date: String
date: String,
pid: String
): List[Relation] = {
rels
.filter(r =>
subRelTypeMapping
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
.map(r => {
val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
@ -510,10 +404,20 @@ object BioschemaToOAFTransformation {
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
val foundResolvedURLId = resolvedURL.map(k => {
if (r.relatedIdentifier.contains(s"${k._1}:"))
k._1
else
null
}).find(s => s != null);
if (foundResolvedURLId.nonEmpty) {
val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"${foundResolvedURLId.get}:")
rel.setTarget(s"${resolvedURL(foundResolvedURLId.get)}${relatedId}")
} else
rel.setTarget(
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
)
rel.setSource(id)
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel

View File

@ -13,12 +13,14 @@
],
"relatedIdentifiers": [
{
"relationType": "CitedBy",
"relatedIdentifier": "https://identifiers.org/pubmed:20399186"
"relationType": "IsCitedBy",
"relatedIdentifier": "https://identifiers.org/pubmed:20399186",
"relatedIdentifierType": "URL"
},
{
"relationType": "IsIdenticalTo",
"relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634"
"relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634",
"relatedIdentifierType": "URL"
}
],
"alternateIdentifiers": [