dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformatio...

434 lines
13 KiB
Scala

package eu.dnetlib.dhp.bioschema
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import java.time.LocalDate
import scala.collection.JavaConverters._
object BioschemaToOAFTransformation {
val mapper = new ObjectMapper()
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
val resolvedURL: Map[String, String] = Map(
"uniprot" -> "https://www.uniprot.org/uniprot/",
"pubmed" -> "https://pubmed.ncbi.nlm.nih.gov/"
)
val collectedFromMap: Map[String, KeyValue] = {
val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
//TODO create pedDatasourceId and update this value
"10|ped_________::pedDatasourceId",
"PED"
)
PEDCollectedFrom.setDataInfo(DATA_INFO)
Map(
"ped" -> PEDCollectedFrom
)
}
def extract_date(input: String): Option[String] = {
val d = Date_regex
.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
})
.find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
case _: Throwable =>
try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
}
}
d
}
def getTypeQualifier(
resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_PUBLICATION_RESOURCE,
resourceTypeGeneral
)
if (typeQualifier != null)
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
null
}
def getResult(
resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): Result = {
val typeQualifiers: (Qualifier, Qualifier) =
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
// i.setInstancetype(typeQualifiers._1)
typeQualifiers._2.getClassname match {
case "dataset" =>
val r = new OafDataset
r.setInstance(List(i).asJava)
return r
case "publication" =>
val r = new Publication
r.setInstance(List(i).asJava)
return r
case "software" =>
val r = new Software
r.setInstance(List(i).asJava)
return r
case "other" =>
val r = new OtherResearchProduct
r.setInstance(List(i).asJava)
return r
}
null
}
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val l: List[String] = for {
JObject(dates) <- json \\ "dates"
JField("dateType", JString(dateTypes)) <- dates
} yield dateTypes
l.exists(p => p.equalsIgnoreCase("available"))
}
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
}
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
OafMapperUtils.structuredProperty(dt, q, null)
}
def generateRelation(
sourceId: String,
targetId: String,
relClass: String,
cf: KeyValue,
di: DataInfo
): Relation = {
val r = new Relation
r.setSource(sourceId)
r.setTarget(targetId)
r.setRelType(ModelConstants.RESULT_PROJECT)
r.setRelClass(relClass)
r.setSubRelType(ModelConstants.OUTCOME)
r.setCollectedfrom(List(cf).asJava)
r.setDataInfo(di)
r
}
def generateOAF(
input: String,
ts: Long,
dateOfCollection: Long,
vocabularies: VocabularyGroup,
exportLinks: Boolean
): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val resourceType = (json \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral =
(json \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "types" \ "schemaOrg").extractOrElse[String](null)
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (result == null)
return List()
val pid = (json \ "id").extract[String]
result.setPid(
List(
OafMapperUtils.structuredProperty(
pid,
"ped",
"ped",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava
)
result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
result.setOriginalId(List(pid).asJava)
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(
titles
.filter(t => t.title.nonEmpty)
.map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(
t.title.get,
t.titleType.get,
t.titleType.get,
ModelConstants.DNET_DATACITE_TITLE,
ModelConstants.DNET_DATACITE_TITLE,
null
)
}
})
.asJava
)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
val i_date = dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
.map(d => extract_date(d.date.get))
val a_date: Option[String] = dates
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
.map(d => extract_date(d.date.get))
.find(d => d != null && d.isDefined)
.map(d => d.get)
if (a_date.isDefined) {
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
} else if (publication_year != null) {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result
.getInstance()
.get(0)
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
result.setRelevantdate(
dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d =>
(
d._1.get,
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
)
)
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2))
.asJava
)
result.setCollectedfrom(List(collectedFromMap("ped")).asJava)
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
result.setDescription(
descriptions
.filter(d => d.description.isDefined)
.map(d => OafMapperUtils.field(d.description.get, null))
.filter(s => s != null)
.asJava
)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
)
val instance = result.getInstance().get(0)
instance.setCollectedfrom(collectedFromMap("ped"))
instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
instance.setPid(result.getPid)
result.setId(IdentifierFactory.createIdentifier(result))
var relations: List[Relation] = List()
if (result.getId == null)
return List()
if (exportLinks) {
val rels: List[RelatedIdentifierType] = for {
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations(
rels,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
pid
)
val identifiers: List[RelatedIdentifierType] = for {
JObject(alternateIdentifier) <- json \\ "alternateIdentifiers"
JField("alternateIdentifier", JString(alternateIdentifierValue)) <- alternateIdentifier
} yield RelatedIdentifierType("IsIdenticalTo", alternateIdentifierValue, "URL")
relations = relations ::: generateRelations(
identifiers,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null,
pid
)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
} else
List(result)
}
private def generateRelations(
rels: List[RelatedIdentifierType],
id: String,
date: String,
pid: String
): List[Relation] = {
rels
.map(r => {
val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType).relType
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType)
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
val foundResolvedURLId = resolvedURL.map(k => {
if (r.relatedIdentifier.contains(s"${k._1}:"))
k._1
else
null
}).find(s => s != null);
if (foundResolvedURLId.nonEmpty) {
val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"${foundResolvedURLId.get}:")
rel.setTarget(s"${resolvedURL(foundResolvedURLId.get)}${relatedId}")
} else
rel.setTarget(
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
)
rel.setSource(id)
rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel
})
}
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
val a = StringUtils.substringAfter(input, "::")
s"10|$b::${DHPUtils.md5(a)}"
}
}