forked from D-Net/dnet-hadoop
resolution of generated relations url to uniprot and pubmed datasources
This commit is contained in:
parent
4975278558
commit
2f5caef77b
|
@ -10,17 +10,17 @@ import java.util.Locale
|
|||
import java.util.regex.Pattern
|
||||
import scala.io.Source
|
||||
|
||||
/** Data model of a single record of the Datacite input Dataset.
  *
  * @param doi       the DOI of the record
  * @param timestamp timestamp of the last update date
  * @param isActive  flag telling whether the record is active or deleted
  * @param json      the native JSON of the record
  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String)
|
||||
///** This class represent the dataModel of the input Dataset of Bioschema Datacite
|
||||
// * @param doi THE DOI
|
||||
// * @param timestamp timestamp of last update date
|
||||
// * @param isActive the record is active or deleted
|
||||
// * @param json the json native records
|
||||
// */
|
||||
//case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
|
||||
|
||||
/*
|
||||
The following class are utility class used for the mapping from
|
||||
json datacite to OAF Schema
|
||||
bioschema json datacite to OAF Schema
|
||||
*/
|
||||
case class RelatedIdentifierType(
|
||||
relationType: String,
|
||||
|
@ -68,10 +68,9 @@ object BioschemaModelConstants {
|
|||
|
||||
val REL_TYPE_VALUE: String = "resultResult"
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
|
||||
val DOI_CLASS = "doi"
|
||||
val SUBJ_CLASS = "keywords"
|
||||
val dataInfo: DataInfo = dataciteDataInfo("0.9")
|
||||
val dataInfo: DataInfo = bioschemaDataInfo("0.9")
|
||||
|
||||
val subRelTypeMapping: Map[String, OAFRelations] = Map(
|
||||
ModelConstants.REFERENCES -> OAFRelations(
|
||||
|
@ -211,13 +210,7 @@ object BioschemaModelConstants {
|
|||
)
|
||||
)
|
||||
|
||||
/** Lines of the classpath resource located at DATACITE_FILTER_PATH, read once
  * when this value is initialised.
  *
  * Fails with an IllegalArgumentException when the resource cannot be found.
  */
val datacite_filter: List[String] = {
  val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
  require(stream != null, s"classpath resource not found: $DATACITE_FILTER_PATH")
  val source = Source.fromInputStream(stream)
  // Materialise the lines before closing: getLines() is lazy, and closing the
  // source also closes the underlying stream, which was previously leaked.
  try source.getLines().toList
  finally source.close()
}
|
||||
|
||||
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
|
||||
def bioschemaDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
|
@ -234,23 +227,6 @@ object BioschemaModelConstants {
|
|||
val df_it: DateTimeFormatter =
|
||||
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||
|
||||
/** Pairs of (award-URI pattern, identifier prefix).
  *
  * Each pattern has three groups: the funding-stream URI prefix, a six-digit
  * number and any trailing text. Both patterns are compiled as multi-line and
  * case-insensitive.
  */
val funder_regex: List[(Pattern, String)] = {
  val flags = Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
  val h2020 =
    Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", flags)
  val fp7 =
    Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", flags)
  List(
    h2020 -> "40|corda__h2020::",
    fp7 -> "40|corda_______::"
  )
}
|
||||
|
||||
val Date_regex: List[Pattern] = List(
|
||||
//Y-M-D
|
||||
Pattern.compile(
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
package eu.dnetlib.dhp.bioschema
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
|
||||
|
@ -13,13 +12,8 @@ import org.json4s.DefaultFormats
|
|||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import java.text.SimpleDateFormat
|
||||
import java.time.LocalDate
|
||||
import java.time.chrono.ThaiBuddhistDate
|
||||
import java.time.format.DateTimeFormatter
|
||||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.{Codec, Source}
|
||||
|
||||
object BioschemaToOAFTransformation {
|
||||
|
||||
|
@ -34,9 +28,15 @@ object BioschemaToOAFTransformation {
|
|||
"0.9"
|
||||
)
|
||||
|
||||
/** Base URL to use for each identifier namespace, keyed by the namespace name. */
val resolvedURL: Map[String, String] = List(
  ("uniprot", "https://www.uniprot.org/uniprot/"),
  ("pubmed", "https://pubmed.ncbi.nlm.nih.gov/")
).toMap
|
||||
|
||||
val collectedFromMap: Map[String, KeyValue] = {
|
||||
val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|ped_________::changeme",
|
||||
//TODO create pedDatasourceId and update this value
|
||||
"10|ped_________::pedDatasourceId",
|
||||
"PED"
|
||||
)
|
||||
PEDCollectedFrom.setDataInfo(DATA_INFO)
|
||||
|
@ -46,59 +46,6 @@ object BioschemaToOAFTransformation {
|
|||
)
|
||||
}
|
||||
|
||||
/** Tells whether a record must be discarded because its raw JSON contains
  * at least one of the strings listed in datacite_filter.
  *
  * @param json the raw JSON text of the record
  * @return true if the record should be skipped
  */
def skip_record(json: String): Boolean =
  datacite_filter.exists(json.contains(_))
|
||||
|
||||
/** Wraps an OAF entity in an [[AtomicAction]] and serialises that action to JSON.
  *
  * Handled entity types are OafDataset, Publication, Software,
  * OtherResearchProduct and Relation.
  *
  * @param item the entity to wrap
  * @return the pair (canonical name of the entity's runtime class, JSON of the action),
  *         or null when the entity is none of the handled types
  */
@deprecated("this method will be removed", "dhp")
def toActionSet(item: Oaf): (String, String) = {
  val mapper = new ObjectMapper()

  // Builds the action for one payload type and pairs its JSON with the payload's class name.
  def asAction[T <: Oaf](payload: T, clazz: Class[T]): (String, String) = {
    val action = new AtomicAction[T]
    action.setClazz(clazz)
    action.setPayload(payload)
    (payload.getClass.getCanonicalName, mapper.writeValueAsString(action))
  }

  item match {
    case dataset: OafDataset       => asAction[OafDataset](dataset, classOf[OafDataset])
    case publication: Publication  => asAction[Publication](publication, classOf[Publication])
    case software: Software        => asAction[Software](software, classOf[Software])
    case orp: OtherResearchProduct => asAction[OtherResearchProduct](orp, classOf[OtherResearchProduct])
    case relation: Relation        => asAction[Relation](relation, classOf[Relation])
    case _                         => null
  }
}
|
||||
|
||||
/** Tells whether an embargo end date lies strictly before today.
  *
  * @param embargo_end_date the end date, parsed with the pattern "[yyyy-MM-dd]"
  * @return true when today is after the given date; false when the date is
  *         today or in the future
  */
def embargo_end(embargo_end_date: String): Boolean = {
  val dayFormat = DateTimeFormatter.ofPattern("[yyyy-MM-dd]")
  LocalDate.parse(embargo_end_date, dayFormat).isBefore(LocalDate.now())
}
|
||||
|
||||
def extract_date(input: String): Option[String] = {
|
||||
val d = Date_regex
|
||||
.map(pattern => {
|
||||
|
@ -127,16 +74,6 @@ object BioschemaToOAFTransformation {
|
|||
d
|
||||
}
|
||||
|
||||
/** Reinterprets a date written with a Thai Buddhist calendar year as an ISO date.
  *
  * The input is parsed with the given pattern; its year, month and day are
  * then read as a Thai Buddhist calendar date and converted to the ISO calendar.
  *
  * @param input  the date text to convert
  * @param format the DateTimeFormatter pattern describing `input`
  * @return the converted date in ISO-8601 form (yyyy-MM-dd), or an empty string
  *         when the input cannot be parsed or converted
  */
def fix_thai_date(input: String, format: String): String = {
  import scala.util.control.NonFatal
  try {
    val parsed = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
    val thai = ThaiBuddhistDate.of(parsed.getYear, parsed.getMonthValue, parsed.getDayOfMonth)
    LocalDate.from(thai).toString
  } catch {
    // Only recoverable failures map to ""; fatal errors (e.g. OutOfMemoryError,
    // InterruptedException) must propagate instead of being silently swallowed.
    case NonFatal(_) => ""
  }
}
|
||||
|
||||
def getTypeQualifier(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
|
@ -197,7 +134,15 @@ object BioschemaToOAFTransformation {
|
|||
if (typeQualifiers == null)
|
||||
return null
|
||||
val i = new Instance
|
||||
i.setInstancetype(typeQualifiers._1)
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
// i.setInstancetype(typeQualifiers._1)
|
||||
typeQualifiers._2.getClassname match {
|
||||
case "dataset" =>
|
||||
val r = new OafDataset
|
||||
|
@ -261,20 +206,6 @@ object BioschemaToOAFTransformation {
|
|||
|
||||
}
|
||||
|
||||
/** Builds the "isProducedBy" relation for an award URI matched by one of the
  * patterns in funder_regex.
  *
  * The first pattern found in the URI is used. The grant id is the URI with
  * every match replaced by the pattern's second group, and the target id is
  * the prefix paired with that pattern followed by the MD5 of the grant id.
  *
  * @param awardUri the award URI of a funding reference
  * @param sourceId the identifier used as source of the relation
  * @return a one-element list holding the relation, or an empty list when no
  *         pattern matches
  */
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] =
  funder_regex
    .find { case (pattern, _) => pattern.matcher(awardUri).find() }
    .map { case (pattern, prefix) =>
      val grantId = pattern.matcher(awardUri).replaceAll("$2")
      val targetId = s"$prefix${DHPUtils.md5(grantId)}"
      generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo)
    }
    .toList
|
||||
|
||||
def generateOAF(
|
||||
input: String,
|
||||
ts: Long,
|
||||
|
@ -282,8 +213,6 @@ object BioschemaToOAFTransformation {
|
|||
vocabularies: VocabularyGroup,
|
||||
exportLinks: Boolean
|
||||
): List[Oaf] = {
|
||||
if (skip_record(input))
|
||||
return List()
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
@ -410,73 +339,43 @@ object BioschemaToOAFTransformation {
|
|||
|
||||
val instance = result.getInstance().get(0)
|
||||
|
||||
val accessRights: List[String] = for {
|
||||
JObject(rightsList) <- json \\ "rightsList"
|
||||
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||
} yield rightsUri
|
||||
|
||||
val aRights: Option[AccessRight] = accessRights
|
||||
.map(r => {
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||
})
|
||||
.find(q => q != null)
|
||||
.map(q => {
|
||||
val a = new AccessRight
|
||||
a.setClassid(q.getClassid)
|
||||
a.setClassname(q.getClassname)
|
||||
a.setSchemeid(q.getSchemeid)
|
||||
a.setSchemename(q.getSchemename)
|
||||
a
|
||||
})
|
||||
|
||||
val access_rights_qualifier =
|
||||
if (aRights.isDefined) aRights.get
|
||||
else
|
||||
OafMapperUtils.accessRight(
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.NOT_AVAILABLE,
|
||||
ModelConstants.DNET_ACCESS_MODES,
|
||||
ModelConstants.DNET_ACCESS_MODES
|
||||
)
|
||||
|
||||
instance.setCollectedfrom(collectedFromMap("ped"))
|
||||
instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
|
||||
instance.setAccessright(access_rights_qualifier)
|
||||
instance.setPid(result.getPid)
|
||||
val license = accessRights
|
||||
.find(r =>
|
||||
r.startsWith("http") && r.matches(
|
||||
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
|
||||
)
|
||||
)
|
||||
if (license.isDefined)
|
||||
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||
|
||||
val awardUris: List[String] = for {
|
||||
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||
} yield awardUri
|
||||
|
||||
result.setId(IdentifierFactory.createIdentifier(result))
|
||||
var relations: List[Relation] =
|
||||
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||
var relations: List[Relation] = List()
|
||||
|
||||
if (result.getId == null)
|
||||
return List()
|
||||
|
||||
if (exportLinks) {
|
||||
val rels: List[RelatedIdentifierType] = for {
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
||||
|
||||
relations = relations ::: generateRelations(
|
||||
rels,
|
||||
result.getId,
|
||||
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
|
||||
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
|
||||
pid
|
||||
)
|
||||
|
||||
val identifiers: List[RelatedIdentifierType] = for {
|
||||
JObject(alternateIdentifier) <- json \\ "alternateIdentifiers"
|
||||
JField("alternateIdentifier", JString(alternateIdentifierValue)) <- alternateIdentifier
|
||||
} yield RelatedIdentifierType("IsIdenticalTo", alternateIdentifierValue, "URL")
|
||||
|
||||
relations = relations ::: generateRelations(
|
||||
identifiers,
|
||||
result.getId,
|
||||
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
|
||||
pid
|
||||
)
|
||||
|
||||
}
|
||||
if (relations != null && relations.nonEmpty) {
|
||||
List(result) ::: relations
|
||||
|
@ -487,15 +386,10 @@ object BioschemaToOAFTransformation {
|
|||
private def generateRelations(
|
||||
rels: List[RelatedIdentifierType],
|
||||
id: String,
|
||||
date: String
|
||||
date: String,
|
||||
pid: String
|
||||
): List[Relation] = {
|
||||
rels
|
||||
.filter(r =>
|
||||
subRelTypeMapping
|
||||
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||
)
|
||||
.map(r => {
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
|
||||
|
@ -510,10 +404,20 @@ object BioschemaToOAFTransformation {
|
|||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
val foundResolvedURLId = resolvedURL.map(k => {
|
||||
if (r.relatedIdentifier.contains(s"${k._1}:"))
|
||||
k._1
|
||||
else
|
||||
null
|
||||
}).find(s => s != null);
|
||||
if (foundResolvedURLId.nonEmpty) {
|
||||
val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"${foundResolvedURLId.get}:")
|
||||
rel.setTarget(s"${resolvedURL(foundResolvedURLId.get)}${relatedId}")
|
||||
} else
|
||||
rel.setTarget(
|
||||
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
|
||||
)
|
||||
rel.setSource(id)
|
||||
rel.setTarget(
|
||||
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
|
||||
)
|
||||
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
|
||||
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
||||
rel
|
||||
|
|
|
@ -13,12 +13,14 @@
|
|||
],
|
||||
"relatedIdentifiers": [
|
||||
{
|
||||
"relationType": "CitedBy",
|
||||
"relatedIdentifier": "https://identifiers.org/pubmed:20399186"
|
||||
"relationType": "IsCitedBy",
|
||||
"relatedIdentifier": "https://identifiers.org/pubmed:20399186",
|
||||
"relatedIdentifierType": "URL"
|
||||
},
|
||||
{
|
||||
"relationType": "IsIdenticalTo",
|
||||
"relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634"
|
||||
"relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634",
|
||||
"relatedIdentifierType": "URL"
|
||||
}
|
||||
],
|
||||
"alternateIdentifiers": [
|
||||
|
|
Loading…
Reference in New Issue