forked from D-Net/dnet-hadoop
Added a datasourceKey workflow parameter so that the correct collectedFrom value and id prefix are chosen for each datasource
This commit is contained in:
parent
e57216a1fa
commit
29ee1b9d82
|
@ -34,6 +34,11 @@
|
||||||
"paramLongName": "exportLinks",
|
"paramLongName": "exportLinks",
|
||||||
"paramDescription": "should export also links",
|
"paramDescription": "should export also links",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "k",
|
||||||
|
"paramLongName": "datasourceKey",
|
||||||
|
"paramDescription": "the key that identifies the datasource",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
|
|
||||||
]
|
]
|
|
@ -5,6 +5,11 @@
|
||||||
<value>/data/bioschema/ped</value>
|
<value>/data/bioschema/ped</value>
|
||||||
<description>the working path of Bioschema stores</description>
|
<description>the working path of Bioschema stores</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>datasourceKey</name>
|
||||||
|
<value>ped</value>
|
||||||
|
<description>the key that identifies the datasource (eg ped, disprot)</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="TransformJob"/>
|
<start to="TransformJob"/>
|
||||||
|
@ -51,6 +56,7 @@
|
||||||
<arg>--sourcePath</arg><arg>${mainPath}/json-datacite</arg>
|
<arg>--sourcePath</arg><arg>${mainPath}/json-datacite</arg>
|
||||||
<arg>--targetPath</arg><arg>${mainPath}/dataset</arg>
|
<arg>--targetPath</arg><arg>${mainPath}/dataset</arg>
|
||||||
<arg>--exportLinks</arg><arg>true</arg>
|
<arg>--exportLinks</arg><arg>true</arg>
|
||||||
|
<arg>--datasourceKey</arg><arg>${datasourceKey}</arg>
|
||||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
package eu.dnetlib.dhp.bioschema
|
package eu.dnetlib.dhp.bioschema
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo
|
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
||||||
|
|
||||||
import java.time.format.DateTimeFormatter
|
import java.time.format.DateTimeFormatter
|
||||||
|
@ -61,6 +61,39 @@ class BioschemaModelConstants extends Serializable {}
|
||||||
|
|
||||||
object BioschemaModelConstants {
|
object BioschemaModelConstants {
|
||||||
|
|
||||||
|
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
|
||||||
|
false,
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||||
|
"0.9"
|
||||||
|
)
|
||||||
|
|
||||||
|
val PED_PREFIX: String = "ped_________"
|
||||||
|
|
||||||
|
val resolvedURL: Map[String, String] = Map(
|
||||||
|
"uniprot" -> "https://www.uniprot.org/uniprot/",
|
||||||
|
"pubmed" -> "https://pubmed.ncbi.nlm.nih.gov/"
|
||||||
|
)
|
||||||
|
|
||||||
|
val collectedFromMap: Map[String, KeyValue] = {
|
||||||
|
val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||||
|
//TODO create pedDatasourceId and update this value
|
||||||
|
"10|ped_________::pedDatasourceId",
|
||||||
|
"Protein Ensemble Database"
|
||||||
|
)
|
||||||
|
PEDCollectedFrom.setDataInfo(DATA_INFO)
|
||||||
|
|
||||||
|
Map(
|
||||||
|
"ped" -> PEDCollectedFrom
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
val datasourceKeyPrefix: Map[String, String] = Map(
|
||||||
|
"ped" -> PED_PREFIX
|
||||||
|
)
|
||||||
|
|
||||||
val REL_TYPE_VALUE: String = "resultResult"
|
val REL_TYPE_VALUE: String = "resultResult"
|
||||||
val DATE_RELATION_KEY = "RelationDate"
|
val DATE_RELATION_KEY = "RelationDate"
|
||||||
val dataInfo: DataInfo = bioschemaDataInfo("0.9")
|
val dataInfo: DataInfo = bioschemaDataInfo("0.9")
|
||||||
|
|
|
@ -18,33 +18,6 @@ object BioschemaToOAFTransformation {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
|
|
||||||
false,
|
|
||||||
null,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
|
||||||
"0.9"
|
|
||||||
)
|
|
||||||
|
|
||||||
val resolvedURL: Map[String, String] = Map(
|
|
||||||
"uniprot" -> "https://www.uniprot.org/uniprot/",
|
|
||||||
"pubmed" -> "https://pubmed.ncbi.nlm.nih.gov/"
|
|
||||||
)
|
|
||||||
|
|
||||||
val collectedFromMap: Map[String, KeyValue] = {
|
|
||||||
val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
|
||||||
//TODO create pedDatasourceId and update this value
|
|
||||||
"10|ped_________::pedDatasourceId",
|
|
||||||
"Protein Ensemble Database"
|
|
||||||
)
|
|
||||||
PEDCollectedFrom.setDataInfo(DATA_INFO)
|
|
||||||
|
|
||||||
Map(
|
|
||||||
"ped" -> PEDCollectedFrom
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
def extract_date(input: String): Option[String] = {
|
def extract_date(input: String): Option[String] = {
|
||||||
val d = Date_regex
|
val d = Date_regex
|
||||||
.map(pattern => {
|
.map(pattern => {
|
||||||
|
@ -89,38 +62,14 @@ object BioschemaToOAFTransformation {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
|
||||||
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
|
||||||
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
|
||||||
}
|
|
||||||
|
|
||||||
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
|
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
|
||||||
OafMapperUtils.structuredProperty(dt, q, null)
|
OafMapperUtils.structuredProperty(dt, q, null)
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateRelation(
|
|
||||||
sourceId: String,
|
|
||||||
targetId: String,
|
|
||||||
relClass: String,
|
|
||||||
cf: KeyValue,
|
|
||||||
di: DataInfo
|
|
||||||
): Relation = {
|
|
||||||
|
|
||||||
val r = new Relation
|
|
||||||
r.setSource(sourceId)
|
|
||||||
r.setTarget(targetId)
|
|
||||||
r.setRelType(ModelConstants.RESULT_PROJECT)
|
|
||||||
r.setRelClass(relClass)
|
|
||||||
r.setSubRelType(ModelConstants.OUTCOME)
|
|
||||||
r.setCollectedfrom(List(cf).asJava)
|
|
||||||
r.setDataInfo(di)
|
|
||||||
r
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
def generateOAF(
|
def generateOAF(
|
||||||
input: String,
|
input: String,
|
||||||
exportLinks: Boolean
|
exportLinks: Boolean,
|
||||||
|
datasourceKey: String
|
||||||
): List[Oaf] = {
|
): List[Oaf] = {
|
||||||
|
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
@ -142,15 +91,15 @@ object BioschemaToOAFTransformation {
|
||||||
List(
|
List(
|
||||||
OafMapperUtils.structuredProperty(
|
OafMapperUtils.structuredProperty(
|
||||||
pid,
|
pid,
|
||||||
"ped",
|
datasourceKey,
|
||||||
"ped",
|
datasourceKey,
|
||||||
ModelConstants.DNET_PID_TYPES,
|
ModelConstants.DNET_PID_TYPES,
|
||||||
ModelConstants.DNET_PID_TYPES,
|
ModelConstants.DNET_PID_TYPES,
|
||||||
DATA_INFO
|
DATA_INFO
|
||||||
)
|
)
|
||||||
).asJava
|
).asJava
|
||||||
)
|
)
|
||||||
result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
|
result.setId(OafMapperUtils.createOpenaireId(50, s"${datasourceKeyPrefix(datasourceKey)}::$pid", true))
|
||||||
result.setOriginalId(List(pid).asJava)
|
result.setOriginalId(List(pid).asJava)
|
||||||
|
|
||||||
result.setDataInfo(dataInfo)
|
result.setDataInfo(dataInfo)
|
||||||
|
@ -215,7 +164,7 @@ object BioschemaToOAFTransformation {
|
||||||
.asJava
|
.asJava
|
||||||
)
|
)
|
||||||
|
|
||||||
result.setCollectedfrom(List(collectedFromMap("ped")).asJava)
|
result.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava)
|
||||||
|
|
||||||
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
|
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
|
||||||
|
|
||||||
|
@ -246,7 +195,7 @@ object BioschemaToOAFTransformation {
|
||||||
})
|
})
|
||||||
.asJava
|
.asJava
|
||||||
)
|
)
|
||||||
instance.setCollectedfrom(collectedFromMap("ped"))
|
instance.setCollectedfrom(collectedFromMap(datasourceKey))
|
||||||
instance.setPid(result.getPid)
|
instance.setPid(result.getPid)
|
||||||
|
|
||||||
result.setId(IdentifierFactory.createIdentifier(result))
|
result.setId(IdentifierFactory.createIdentifier(result))
|
||||||
|
@ -294,7 +243,8 @@ object BioschemaToOAFTransformation {
|
||||||
rels,
|
rels,
|
||||||
result.getId,
|
result.getId,
|
||||||
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
|
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
|
||||||
pid
|
pid,
|
||||||
|
datasourceKey
|
||||||
)
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -308,12 +258,13 @@ object BioschemaToOAFTransformation {
|
||||||
rels: List[RelatedIdentifierType],
|
rels: List[RelatedIdentifierType],
|
||||||
id: String,
|
id: String,
|
||||||
date: String,
|
date: String,
|
||||||
pid: String
|
pid: String,
|
||||||
|
datasourceKey: String
|
||||||
): List[Relation] = {
|
): List[Relation] = {
|
||||||
rels
|
rels
|
||||||
.map(r => {
|
.map(r => {
|
||||||
val rel = new Relation
|
val rel = new Relation
|
||||||
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
|
rel.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava)
|
||||||
rel.setDataInfo(dataInfo)
|
rel.setDataInfo(dataInfo)
|
||||||
|
|
||||||
val subRelType = subRelTypeMapping(r.relationType).relType
|
val subRelType = subRelTypeMapping(r.relationType).relType
|
||||||
|
@ -341,16 +292,9 @@ object BioschemaToOAFTransformation {
|
||||||
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
|
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
|
||||||
)
|
)
|
||||||
rel.setSource(id)
|
rel.setSource(id)
|
||||||
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
|
rel.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava)
|
||||||
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
||||||
rel
|
rel
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateDSId(input: String): String = {
|
|
||||||
val b = StringUtils.substringBefore(input, "::")
|
|
||||||
val a = StringUtils.substringAfter(input, "::")
|
|
||||||
s"10|$b::${DHPUtils.md5(a)}"
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,8 @@ class GenerateBioschemaDatasetSpark(propertyPath: String, args: Array[String], l
|
||||||
log.info(s"SourcePath is '$sourcePath'")
|
log.info(s"SourcePath is '$sourcePath'")
|
||||||
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
|
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
|
||||||
log.info(s"exportLinks is '$exportLinks'")
|
log.info(s"exportLinks is '$exportLinks'")
|
||||||
|
val datasourceKey = parser.get("datasourceKey").toLowerCase
|
||||||
|
log.info(s"datasourceKey is '$datasourceKey'")
|
||||||
|
|
||||||
// val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
// val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||||
// log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
|
// log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
|
||||||
|
@ -34,7 +36,7 @@ class GenerateBioschemaDatasetSpark(propertyPath: String, args: Array[String], l
|
||||||
|
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
|
|
||||||
generateBioschemaDataset(sourcePath, exportLinks, targetPath, spark)
|
generateBioschemaDataset(sourcePath, exportLinks, targetPath, datasourceKey, spark)
|
||||||
|
|
||||||
// reportTotalSize(targetPath, outputBasePath)
|
// reportTotalSize(targetPath, outputBasePath)
|
||||||
}
|
}
|
||||||
|
@ -63,6 +65,7 @@ class GenerateBioschemaDatasetSpark(propertyPath: String, args: Array[String], l
|
||||||
sourcePath: String,
|
sourcePath: String,
|
||||||
exportLinks: Boolean,
|
exportLinks: Boolean,
|
||||||
targetPath: String,
|
targetPath: String,
|
||||||
|
datasourceKey: String,
|
||||||
spark: SparkSession
|
spark: SparkSession
|
||||||
): Unit = {
|
): Unit = {
|
||||||
require(spark != null)
|
require(spark != null)
|
||||||
|
@ -72,7 +75,7 @@ class GenerateBioschemaDatasetSpark(propertyPath: String, args: Array[String], l
|
||||||
spark.createDataset(
|
spark.createDataset(
|
||||||
spark.sparkContext
|
spark.sparkContext
|
||||||
.textFile(sourcePath)
|
.textFile(sourcePath)
|
||||||
.flatMap(i => BioschemaToOAFTransformation.generateOAF(i, exportLinks))
|
.flatMap(i => BioschemaToOAFTransformation.generateOAF(i, exportLinks, datasourceKey))
|
||||||
),
|
),
|
||||||
targetPath
|
targetPath
|
||||||
)
|
)
|
||||||
|
|
|
@ -46,13 +46,13 @@ class BioschemaDataciteToOAFTest {
|
||||||
val instance = new GenerateBioschemaDatasetSpark(null, null, log)
|
val instance = new GenerateBioschemaDatasetSpark(null, null, log)
|
||||||
val targetPath = s"$workingDir/result"
|
val targetPath = s"$workingDir/result"
|
||||||
|
|
||||||
instance.generateBioschemaDataset(path, exportLinks = true, targetPath, spark)
|
instance.generateBioschemaDataset(path, exportLinks = true, targetPath, "ped", spark)
|
||||||
|
|
||||||
val total_items = spark.read.text(targetPath).count()
|
val total_items = spark.read.text(targetPath).count()
|
||||||
println(s"total_items: $total_items")
|
println(s"total_items: $total_items")
|
||||||
assertTrue(total_items == 50)
|
assertTrue(total_items == 50)
|
||||||
|
|
||||||
instance.generateBioschemaDataset(path, exportLinks = false, targetPath, spark)
|
instance.generateBioschemaDataset(path, exportLinks = false, targetPath, "ped", spark)
|
||||||
|
|
||||||
val total_datasets = spark.read.text(targetPath).count()
|
val total_datasets = spark.read.text(targetPath).count()
|
||||||
println(s"total_datasets: $total_datasets")
|
println(s"total_datasets: $total_datasets")
|
||||||
|
@ -67,7 +67,7 @@ class BioschemaDataciteToOAFTest {
|
||||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/bioschema/ped_record.json"))
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/bioschema/ped_record.json"))
|
||||||
.mkString
|
.mkString
|
||||||
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||||
val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, true)
|
val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, true, "ped")
|
||||||
res.foreach(r => {
|
res.foreach(r => {
|
||||||
println(mapper.writeValueAsString(r))
|
println(mapper.writeValueAsString(r))
|
||||||
println("----------------------------")
|
println("----------------------------")
|
||||||
|
|
Loading…
Reference in New Issue