Added a datasource key to the workflow parameters so that the collectedFrom and id values are chosen correctly

This commit is contained in:
Enrico Ottonello 2022-03-03 12:31:29 +01:00
parent e57216a1fa
commit 29ee1b9d82
6 changed files with 67 additions and 76 deletions

View File

@ -34,6 +34,11 @@
"paramLongName": "exportLinks",
"paramDescription": "should export also links",
"paramRequired": false
},
{
"paramName": "k",
"paramLongName": "datasourceKey",
"paramDescription": "the key that identifies the datasource",
"paramRequired": true
}
]

View File

@ -5,6 +5,11 @@
<value>/data/bioschema/ped</value>
<description>the working path of Bioschema stores</description>
</property>
<property>
<name>datasourceKey</name>
<value>ped</value>
<description>the key that identifies the datasource (e.g. ped, disprot)</description>
</property>
</parameters>
<start to="TransformJob"/>
@ -51,6 +56,7 @@
<arg>--sourcePath</arg><arg>${mainPath}/json-datacite</arg>
<arg>--targetPath</arg><arg>${mainPath}/dataset</arg>
<arg>--exportLinks</arg><arg>true</arg>
<arg>--datasourceKey</arg><arg>${datasourceKey}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>

View File

@ -1,7 +1,7 @@
package eu.dnetlib.dhp.bioschema
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.DataInfo
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import java.time.format.DateTimeFormatter
@ -61,6 +61,39 @@ class BioschemaModelConstants extends Serializable {}
object BioschemaModelConstants {
// Default provenance DataInfo for bioschema entities: action-set provenance
// qualifier with trust "0.9". NOTE(review): the three booleans and the null
// map positionally to OafMapperUtils.dataInfo parameters (presumably
// invisible / deletedbyinference / inferred + inferenceprovenance) — confirm
// against the OafMapperUtils signature.
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
// 12-character dnet namespace prefix for the PED datasource.
val PED_PREFIX: String = "ped_________"
// Base URLs used to resolve related-identifier types into concrete links.
val resolvedURL: Map[String, String] = Map(
"uniprot" -> "https://www.uniprot.org/uniprot/",
"pubmed" -> "https://pubmed.ncbi.nlm.nih.gov/"
)
// Maps a datasource key to the KeyValue stamped as "collectedfrom" on
// generated entities and relations. Only "ped" is supported here; the
// datasource id is still a placeholder (see TODO below).
val collectedFromMap: Map[String, KeyValue] = {
val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
//TODO create pedDatasourceId and update this value
"10|ped_________::pedDatasourceId",
"Protein Ensemble Database"
)
PEDCollectedFrom.setDataInfo(DATA_INFO)
Map(
"ped" -> PEDCollectedFrom
)
}
// Maps a datasource key to its dnet namespace prefix, used when building
// OpenAIRE identifiers for results collected from that datasource.
val datasourceKeyPrefix: Map[String, String] = Map(
"ped" -> PED_PREFIX
)
// Relation type for result-to-result links.
val REL_TYPE_VALUE: String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
// NOTE(review): bioschemaDataInfo is defined outside this view — presumably
// builds a DataInfo with trust "0.9"; confirm its flags before relying on it.
val dataInfo: DataInfo = bioschemaDataInfo("0.9")

View File

@ -18,33 +18,6 @@ object BioschemaToOAFTransformation {
val mapper = new ObjectMapper()
// NOTE(review): duplicates BioschemaModelConstants.DATA_INFO — prefer a
// single definition in the shared constants object.
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
// NOTE(review): duplicates BioschemaModelConstants.resolvedURL — prefer the
// shared constants object as the single source of truth.
val resolvedURL: Map[String, String] = Map(
"uniprot" -> "https://www.uniprot.org/uniprot/",
"pubmed" -> "https://pubmed.ncbi.nlm.nih.gov/"
)
// NOTE(review): duplicates BioschemaModelConstants.collectedFromMap — the
// datasource id below is still a placeholder (see TODO).
val collectedFromMap: Map[String, KeyValue] = {
val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
//TODO create pedDatasourceId and update this value
"10|ped_________::pedDatasourceId",
"Protein Ensemble Database"
)
PEDCollectedFrom.setDataInfo(DATA_INFO)
Map(
"ped" -> PEDCollectedFrom
)
}
def extract_date(input: String): Option[String] = {
val d = Date_regex
.map(pattern => {
@ -89,38 +62,14 @@ object BioschemaToOAFTransformation {
null
}
/** Builds a dnet target identifier: a 15-character "prefix|pidtype" namespace
  * (underscore-padded) followed by "::" and the md5 of the lowercased pid.
  */
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
  val namespace = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
  val hashedPid = IdentifierFactory.md5(pid.toLowerCase)
  s"$namespace::$hashedPid"
}
/** Wraps a date string into a StructuredProperty carrying the given
  * qualifier and no DataInfo.
  */
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty =
  OafMapperUtils.structuredProperty(dt, q, null)
/** Creates a result-project "outcome" Relation from sourceId to targetId
  * with the given relation class, stamped with the supplied collectedfrom
  * KeyValue and DataInfo.
  */
def generateRelation(
  sourceId: String,
  targetId: String,
  relClass: String,
  cf: KeyValue,
  di: DataInfo
): Relation = {
  val relation = new Relation
  relation.setRelType(ModelConstants.RESULT_PROJECT)
  relation.setSubRelType(ModelConstants.OUTCOME)
  relation.setRelClass(relClass)
  relation.setSource(sourceId)
  relation.setTarget(targetId)
  relation.setCollectedfrom(List(cf).asJava)
  relation.setDataInfo(di)
  relation
}
def generateOAF(
input: String,
exportLinks: Boolean
exportLinks: Boolean,
datasourceKey: String
): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -142,15 +91,15 @@ object BioschemaToOAFTransformation {
List(
OafMapperUtils.structuredProperty(
pid,
"ped",
"ped",
datasourceKey,
datasourceKey,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava
)
result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
result.setId(OafMapperUtils.createOpenaireId(50, s"${datasourceKeyPrefix(datasourceKey)}::$pid", true))
result.setOriginalId(List(pid).asJava)
result.setDataInfo(dataInfo)
@ -215,7 +164,7 @@ object BioschemaToOAFTransformation {
.asJava
)
result.setCollectedfrom(List(collectedFromMap("ped")).asJava)
result.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava)
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
@ -246,7 +195,7 @@ object BioschemaToOAFTransformation {
})
.asJava
)
instance.setCollectedfrom(collectedFromMap("ped"))
instance.setCollectedfrom(collectedFromMap(datasourceKey))
instance.setPid(result.getPid)
result.setId(IdentifierFactory.createIdentifier(result))
@ -294,7 +243,8 @@ object BioschemaToOAFTransformation {
rels,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
pid
pid,
datasourceKey
)
}
@ -308,12 +258,13 @@ object BioschemaToOAFTransformation {
rels: List[RelatedIdentifierType],
id: String,
date: String,
pid: String
pid: String,
datasourceKey: String
): List[Relation] = {
rels
.map(r => {
val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
rel.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava)
rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType).relType
@ -341,16 +292,9 @@ object BioschemaToOAFTransformation {
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
)
rel.setSource(id)
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
rel.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel
})
}
/** Derives a datasource id from an input of the form "namespace::local":
  * keeps the namespace, replaces the local part with its md5 digest, and
  * prepends the "10|" datasource type prefix.
  */
def generateDSId(input: String): String = {
  val namespace = StringUtils.substringBefore(input, "::")
  val localPart = StringUtils.substringAfter(input, "::")
  s"10|$namespace::${DHPUtils.md5(localPart)}"
}
}

View File

@ -22,6 +22,8 @@ class GenerateBioschemaDatasetSpark(propertyPath: String, args: Array[String], l
log.info(s"SourcePath is '$sourcePath'")
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
log.info(s"exportLinks is '$exportLinks'")
val datasourceKey = parser.get("datasourceKey").toLowerCase
log.info(s"datasourceKey is '$datasourceKey'")
// val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
// log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
@ -34,7 +36,7 @@ class GenerateBioschemaDatasetSpark(propertyPath: String, args: Array[String], l
val targetPath = parser.get("targetPath")
generateBioschemaDataset(sourcePath, exportLinks, targetPath, spark)
generateBioschemaDataset(sourcePath, exportLinks, targetPath, datasourceKey, spark)
// reportTotalSize(targetPath, outputBasePath)
}
@ -63,6 +65,7 @@ class GenerateBioschemaDatasetSpark(propertyPath: String, args: Array[String], l
sourcePath: String,
exportLinks: Boolean,
targetPath: String,
datasourceKey: String,
spark: SparkSession
): Unit = {
require(spark != null)
@ -72,7 +75,7 @@ class GenerateBioschemaDatasetSpark(propertyPath: String, args: Array[String], l
spark.createDataset(
spark.sparkContext
.textFile(sourcePath)
.flatMap(i => BioschemaToOAFTransformation.generateOAF(i, exportLinks))
.flatMap(i => BioschemaToOAFTransformation.generateOAF(i, exportLinks, datasourceKey))
),
targetPath
)

View File

@ -46,13 +46,13 @@ class BioschemaDataciteToOAFTest {
val instance = new GenerateBioschemaDatasetSpark(null, null, log)
val targetPath = s"$workingDir/result"
instance.generateBioschemaDataset(path, exportLinks = true, targetPath, spark)
instance.generateBioschemaDataset(path, exportLinks = true, targetPath, "ped", spark)
val total_items = spark.read.text(targetPath).count()
println(s"total_items: $total_items")
assertTrue(total_items == 50)
instance.generateBioschemaDataset(path, exportLinks = false, targetPath, spark)
instance.generateBioschemaDataset(path, exportLinks = false, targetPath, "ped", spark)
val total_datasets = spark.read.text(targetPath).count()
println(s"total_datasets: $total_datasets")
@ -67,7 +67,7 @@ class BioschemaDataciteToOAFTest {
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/bioschema/ped_record.json"))
.mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, true)
val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, true, "ped")
res.foreach(r => {
println(mapper.writeValueAsString(r))
println("----------------------------")