added subjects to oaf generation

This commit is contained in:
Enrico Ottonello 2022-03-18 18:10:39 +01:00
parent db831e6f43
commit afe84c4244
5 changed files with 113 additions and 32 deletions

View File

@ -40,19 +40,14 @@ case class CreatorType(
/** One title entry of a record.
  *
  * All three fields are optional, so an entry that only provides `title`
  * is still representable.
  *
  * @param title     the title text
  * @param titleType optional qualifier of the title
  * @param lang      optional language of the title
  */
case class TitleType(
  title: Option[String],
  titleType: Option[String],
  lang: Option[String]
)
// Two-field form of a subject entry: the subject text and the scheme it belongs to.
// NOTE(review): SubjectType is declared a second time directly below with the fields
// (schemeURI, value, subjectScheme); this form looks like the pre-change side of the
// diff - confirm that only one of the two declarations is meant to exist.
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
/** One entry of the `subjects` array of an input record.
  *
  * Field names mirror the JSON keys of a subject object; every field is
  * optional, so partially filled entries can still be parsed.
  *
  * @param schemeURI     URI associated with the subject scheme
  * @param value         the subject text itself
  * @param subjectScheme identifier of the scheme the subject belongs to
  */
case class SubjectType(
  schemeURI: Option[String],
  value: Option[String],
  subjectScheme: Option[String]
)
/** One description entry of a record.
  *
  * @param descriptionType optional qualifier of the description
  * @param description     the description text
  */
case class DescriptionType(
  descriptionType: Option[String],
  description: Option[String]
)
/** One funding reference of a record.
  *
  * Groups what is known about a funder (name, identifier and identifier type)
  * with what is known about the award (number, title and URI). Every field is
  * optional.
  */
case class FundingReferenceType(
  funderIdentifierType: Option[String],
  awardTitle: Option[String],
  awardUri: Option[String],
  funderName: Option[String],
  funderIdentifier: Option[String],
  awardNumber: Option[String]
)
/** One entry of the `dates` array of an input record.
  *
  * @param date     the date as written in the source record
  * @param dateType the kind of date, e.g. "Collected"
  */
case class DateType(
  date: Option[String],
  dateType: Option[String]
)
/** Triple of strings describing a relation.
  *
  * NOTE(review): presumably `relation` is the relation name, `inverse` the name
  * of the opposite direction and `relType` the relation category - confirm
  * against the code that builds these values.
  */
case class OAFRelations(
  relation: String,
  inverse: String,
  relType: String
)

View File

@ -158,9 +158,23 @@ object BioschemaToOAFTransformation {
.asJava
)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
// Subjects are optional in the input records. When a record has no "subjects" key,
// `\\` does not return a JSON array and a plain `extract[List[SubjectType]]` throws a
// MappingException; `extractOrElse` falls back to an empty list so the record is still
// converted (a "subjects" value that cannot be parsed is treated the same way).
val subjects = (json \\ "subjects").extractOrElse[List[SubjectType]](List.empty)
result.setSubject(
  subjects
    .flatMap { s =>
      // Keep only subjects that carry a value, a scheme and a scheme URI.
      // Combining the Options replaces the former filter-then-`.get` sequence.
      for {
        value  <- s.value
        scheme <- s.subjectScheme
        uri    <- s.schemeURI
      } yield OafMapperUtils.structuredProperty(
        value,
        scheme,
        uri,
        ModelConstants.DNET_SUBJECT_TYPOLOGIES,
        ModelConstants.DNET_SUBJECT_TYPOLOGIES,
        null
      )
    }
    .asJava
)
val instance = result.getInstance().get(0)

View File

@ -1,10 +1,5 @@
{"id":"PED00001#P38634_A_1","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00001#P38634_A_1","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:20399186","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/P38634","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:P38634"}],"descriptions":[],"titles":[{"title":"Protein SIC1"}],"dates":[{"date":"2021-12-10T11:11:09","dateType":"Collected"}]}
{"id":"PED00002#O14558_A_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00002#O14558_A_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:24382496","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/O14558","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:O14558"}],"descriptions":[],"titles":[{"title":"Heat shock protein beta-6"}],"dates":[{"date":"2021-12-10T11:11:14","dateType":"Collected"}]}
{"id":"PED00002#O14558_B_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00002#O14558_B_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:24382496","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/O14558","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:O14558"}],"descriptions":[],"titles":[{"title":"Heat shock protein beta-6"}],"dates":[{"date":"2021-12-10T11:11:14","dateType":"Collected"}]}
{"id":"PED00003#Q16143_A_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00003#Q16143_A_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:25389903","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/Q16143","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:Q16143"}],"descriptions":[],"titles":[{"title":"Beta-synuclein"}],"dates":[{"date":"2021-12-10T11:11:17","dateType":"Collected"}]}
{"id":"PED00004#O43806_A_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00004#O43806_A_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:16214166","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/O43806","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:O43806"}],"descriptions":[],"titles":[{"title":"Cyclin-dependent kinase inhibitor 1B"}],"dates":[{"date":"2021-12-10T11:11:22","dateType":"Collected"}]}
{"id":"PED00005#O14558_A_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00005#O14558_A_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:24382496","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/O14558","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:O14558"}],"descriptions":[],"titles":[{"title":"Heat shock protein beta-6"}],"dates":[{"date":"2021-12-10T11:11:24","dateType":"Collected"}]}
{"id":"PED00005#O14558_B_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00005#O14558_B_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:24382496","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/O14558","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:O14558"}],"descriptions":[],"titles":[{"title":"Heat shock protein beta-6"}],"dates":[{"date":"2021-12-10T11:11:24","dateType":"Collected"}]}
{"id":"PED00006#P37840_A_1","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00006#P37840_A_1","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:25389903","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/P37840","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:P37840"}],"descriptions":[],"titles":[{"title":"Alpha-synuclein"}],"dates":[{"date":"2021-12-10T11:11:27","dateType":"Collected"}]}
{"id":"PED00006#Q16143_A_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00006#Q16143_A_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:25389903","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/Q16143","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:Q16143"}],"descriptions":[],"titles":[{"title":"Beta-synuclein"}],"dates":[{"date":"2021-12-10T11:11:27","dateType":"Collected"}]}
{"id":"PED00006#Q16143_A_2","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00006#Q16143_A_2","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:25389903","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/Q16143","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:Q16143"}],"descriptions":[],"titles":[{"title":"Beta-synuclein"}],"dates":[{"date":"2021-12-10T11:11:27","dateType":"Collected"}]}
{"id":"PED00111#O75880_A_1","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00111#O75880_A_1","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:16735468","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/O75880","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:O75880"}],"descriptions":[],"titles":[{"title":"PED00111#O75880_A_1 - Structural ensemble of the C-terminal region of Sco1 (132-301), apo structure."}],"dates":[{"date":"2021-12-10T11:17:16","dateType":"Collected"}],"subjects":[{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00120","value":"NMR","subjectScheme":"IDPO:00120"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00141","value":"TOCSY","subjectScheme":"IDPO:00141"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00143","value":"NOESY","subjectScheme":"IDPO:00143"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00147","value":"HSQC","subjectScheme":"IDPO:00147"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00153","value":"HNHA","subjectScheme":"IDPO:00153"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00154","value":"HNCO","subjectScheme":"IDPO:00154"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00155","value":"HNCA","subjectScheme":"IDPO:00155"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00156","value":"HNCACO","subjectScheme":"IDPO:00156"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00157","value":"HNCOCA","subjectScheme":"IDPO:00157"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00158","value":"HNCACB","subjectScheme":"IDPO:00158"},{"schemeURI":"https://disprot.org/
assets/data/IDPO_v0.2.owl#IDPO:00160","value":"CBCACONH","subjectScheme":"IDPO:00160"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00164","value":"CBCANH","subjectScheme":"IDPO:00164"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00175","value":"NOE","subjectScheme":"IDPO:00175"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00228","value":"AMBER","subjectScheme":"IDPO:00228"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl:00190","value":"DYANA","subjectScheme":"IDPO:00190"}]}
{"id":"PED00112#P19634_A_1","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00112#P19634_A_1","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:24840010","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/P19634","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:P19634"}],"descriptions":[],"titles":[{"title":"PED00112#P19634_A_1 - Structural ensemble of the TM VI-VII of the nhe1 isoform of the sodium/hydrogen exchanger (226-274)"}],"dates":[{"date":"2021-12-10T11:17:20","dateType":"Collected"}],"subjects":[{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00120","value":"NMR","subjectScheme":"IDPO:00120"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00141","value":"TOCSY","subjectScheme":"IDPO:00141"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00143","value":"NOESY","subjectScheme":"IDPO:00143"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00147","value":"HSQC","subjectScheme":"IDPO:00147"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00167","value":"chemical shift","subjectScheme":"IDPO:00167"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl:00188","value":"Structure calculation","subjectScheme":"IDPO:00188"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl:00202","value":"NMRpipe","subjectScheme":"IDPO:00202"}]}
{"id":"PED00113#P04156_A_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00113#P04156_A_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/P04156","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:P04156"}],"descriptions":[],"titles":[{"title":"PED00113#P04156_A_0 - Structural ensemble of major prion protein (173-195)"}],"dates":[{"date":"2021-12-10T11:17:24","dateType":"Collected"}],"subjects":[{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00120","value":"NMR","subjectScheme":"IDPO:00120"}]}
{"id":"PED00114#P27958_A_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00114#P27958_A_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsCitedBy","relatedIdentifier":"https://identifiers.org/pubmed:15247283","relatedIdentifierType":"URL"},{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/P27958","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:P27958"}],"descriptions":[],"titles":[{"title":"PED00114#P27958_A_0 - Structural ensemble of the membrane anchor domain of the nonstructural protein 5A (NS5A) of hepatitis C virus (1973-2003), in 50% tfe"}],"dates":[{"date":"2021-12-10T11:17:27","dateType":"Collected"}],"subjects":[{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00120","value":"NMR","subjectScheme":"IDPO:00120"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00141","value":"TOCSY","subjectScheme":"IDPO:00141"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00143","value":"NOESY","subjectScheme":"IDPO:00143"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00147","value":"HSQC","subjectScheme":"IDPO:00147"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00175","value":"NOE","subjectScheme":"IDPO:00175"},{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl:00194","value":"X-PLOR","subjectScheme":"IDPO:00194"}]}
{"id":"PED00115#P04156_A_0","types":{"resourceType":"Protein","resourceTypeGeneral":"Dataset"},"creators":[],"identifiers":[{"identifier":"https://proteinensemble.org/PED00115#P04156_A_0","identifierType":"URL"}],"relatedIdentifiers":[{"relationType":"IsIdenticalTo","relatedIdentifier":"http://purl.uniprot.org/uniprot/P04156","relatedIdentifierType":"URL"}],"alternateIdentifiers":[{"alternateIdentifier":"https://identifiers.org/uniprot:P04156"}],"descriptions":[],"titles":[{"title":"PED00115#P04156_A_0 - Structural ensemble of major prion protein (173-195), mutant D178N"}],"dates":[{"date":"2021-12-10T11:17:30","dateType":"Collected"}],"subjects":[{"schemeURI":"https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00120","value":"NMR","subjectScheme":"IDPO:00120"}]}

View File

@ -1,5 +1,5 @@
{
"id": "PED00001#P38634_A_1",
"id": "PED00111#O75880_A_1",
"types": {
"resourceType": "Protein",
"resourceTypeGeneral": "Dataset"
@ -7,37 +7,114 @@
"creators": [],
"identifiers": [
{
"identifier": "https://proteinensemble.org/PED00001#P38634_A_1",
"identifier": "https://proteinensemble.org/PED00111#O75880_A_1",
"identifierType": "URL"
}
],
"relatedIdentifiers": [
{
"relationType": "IsCitedBy",
"relatedIdentifier": "https://identifiers.org/pubmed:20399186",
"relatedIdentifier": "https://identifiers.org/pubmed:16735468",
"relatedIdentifierType": "URL"
},
{
"relationType": "IsIdenticalTo",
"relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634",
"relatedIdentifier": "http://purl.uniprot.org/uniprot/O75880",
"relatedIdentifierType": "URL"
}
],
"alternateIdentifiers": [
{
"alternateIdentifier": "https://identifiers.org/uniprot:P38634"
"alternateIdentifier": "https://identifiers.org/uniprot:O75880"
}
],
"descriptions": [],
"titles": [
{
"title": "Protein SIC1"
"title": "PED00111#O75880_A_1 - Structural ensemble of the C-terminal region of Sco1 (132-301), apo structure."
}
],
"dates": [
{
"date": "2021-12-10T11:11:09",
"date": "2021-12-10T11:17:16",
"dateType": "Collected"
}
],
"subjects": [
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00120",
"value": "NMR",
"subjectScheme": "IDPO:00120"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00141",
"value": "TOCSY",
"subjectScheme": "IDPO:00141"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00143",
"value": "NOESY",
"subjectScheme": "IDPO:00143"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00147",
"value": "HSQC",
"subjectScheme": "IDPO:00147"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00153",
"value": "HNHA",
"subjectScheme": "IDPO:00153"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00154",
"value": "HNCO",
"subjectScheme": "IDPO:00154"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00155",
"value": "HNCA",
"subjectScheme": "IDPO:00155"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00156",
"value": "HNCACO",
"subjectScheme": "IDPO:00156"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00157",
"value": "HNCOCA",
"subjectScheme": "IDPO:00157"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00158",
"value": "HNCACB",
"subjectScheme": "IDPO:00158"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00160",
"value": "CBCACONH",
"subjectScheme": "IDPO:00160"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00164",
"value": "CBCANH",
"subjectScheme": "IDPO:00164"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00175",
"value": "NOE",
"subjectScheme": "IDPO:00175"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00228",
"value": "AMBER",
"subjectScheme": "IDPO:00228"
},
{
"schemeURI": "https://disprot.org/assets/data/IDPO_v0.2.owl:00190",
"value": "DYANA",
"subjectScheme": "IDPO:00190"
}
]
}

View File

@ -50,13 +50,13 @@ class BioschemaDataciteToOAFTest {
val total_items = spark.read.text(targetPath).count()
println(s"total_items: $total_items")
assertTrue(total_items == 50)
assertTrue(total_items == 21)
instance.generateBioschemaDataset(path, exportLinks = false, targetPath, "ped", "protein", spark)
val total_datasets = spark.read.text(targetPath).count()
println(s"total_datasets: $total_datasets")
assertTrue(total_datasets == 10)
assertTrue(total_datasets == 5)
spark.stop()
}