2022-02-22 11:42:57 +01:00
|
|
|
package eu.dnetlib.dhp.bioschema
|
|
|
|
|
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper
|
|
|
|
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
|
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
|
|
|
|
import eu.dnetlib.dhp.utils.DHPUtils
|
|
|
|
import org.apache.commons.lang3.StringUtils
|
|
|
|
import org.json4s.DefaultFormats
|
|
|
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
|
|
|
import org.json4s.jackson.JsonMethods.parse
|
|
|
|
|
|
|
|
import java.time.LocalDate
|
|
|
|
import scala.collection.JavaConverters._
|
|
|
|
|
|
|
|
object BioschemaToOAFTransformation {
|
|
|
|
|
|
|
|
// Jackson ObjectMapper held as a member of this object; none of the methods
// defined in this object reference it directly.
val mapper = new ObjectMapper()
|
|
|
|
|
|
|
|
/** Extracts the first date-like token from `input` and normalises it.
  *
  * Every pattern in `Date_regex` is tried in order and the first one that
  * matches supplies the token. A four-character token is treated as a bare
  * year and expanded to `01-01-<year>` before parsing. Parsing is attempted
  * with `df_en` first and `df_it` second.
  *
  * @param input free-text date value; `null` is tolerated and yields `None`
  * @return the parsed date rendered with `LocalDate.toString`, or `None` when
  *         no pattern matches or neither formatter can parse the token
  */
def extract_date(input: String): Option[String] = {
  import scala.util.Try

  // Try only traps non-fatal exceptions, so fatal JVM errors are no longer swallowed.
  def normalise(token: String): Option[String] = {
    val candidate = if (token.length == 4) s"01-01-$token" else token
    Try(LocalDate.parse(candidate, df_en))
      .orElse(Try(LocalDate.parse(candidate, df_it)))
      .toOption
      .map(_.toString)
  }

  Option(input)
    .flatMap { text =>
      Date_regex
        .map(_.matcher(text))
        .collectFirst { case m if m.find() => m.group(0) }
    }
    .flatMap(normalise)
}
|
|
|
|
|
2022-03-15 17:36:48 +01:00
|
|
|
/** Creates the empty OAF result that corresponds to a Bioschema resource class.
  *
  * Only `PROTEIN` (compared case-insensitively) is supported: it produces an
  * OAF dataset holding a single instance, with both the instance type and the
  * resource type set to `PROTEIN_RESOURCETYPE`.
  *
  * @param resourceClassName name of the Bioschema resource class
  * @return the pre-populated result, or `null` when the class name is `null`
  *         or not supported (callers are expected to check for `null`)
  */
def getResult(resourceClassName: String): Result =
  // Locale.ROOT keeps the comparison stable regardless of the JVM default locale.
  Option(resourceClassName).map(_.toUpperCase(java.util.Locale.ROOT)) match {
    case Some("PROTEIN") =>
      val instance = new Instance
      instance.setInstancetype(PROTEIN_RESOURCETYPE)

      val dataset = new OafDataset
      dataset.setInstance(List(instance).asJava)
      dataset.setResourcetype(PROTEIN_RESOURCETYPE)
      dataset
    case _ =>
      // Previously an unsupported name raised MatchError instead of reaching the null result.
      null
  }
|
|
|
|
|
|
|
|
/** Wraps a date string and its qualifier into an OAF structured property.
  *
  * Delegates to `OafMapperUtils.structuredProperty`, passing `null` as the
  * third argument.
  *
  * @param dt date value to store
  * @param q  qualifier attached to the date
  * @return the structured property built from `dt` and `q`
  */
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty =
  OafMapperUtils.structuredProperty(dt, q, null)
|
|
|
|
|
|
|
|
/** Converts one Bioschema JSON record into OAF entities.
  *
  * The record is mapped onto the result returned by `getResult`: pid, original
  * id, titles, dates, descriptions, subjects, and the first instance's urls,
  * pid and alternate identifiers. When `exportLinks` is set, the record's
  * `relatedIdentifiers` are additionally turned into relations.
  *
  * @param input             JSON text of the record
  * @param exportLinks       whether relations must be generated from `relatedIdentifiers`
  * @param datasourceKey     key used for the pid type and to look up
  *                          `datasourceKeyPrefix` / `collectedFromMap`
  * @param resourceClassName Bioschema resource class, forwarded to `getResult`
  * @return the result followed by its relations; an empty list when `getResult`
  *         yields `null` or when no identifier can be created for the result
  * @throws RuntimeException when the record has no titles ("Title not found")
  *                          or no identifiers to use as urls ("Url not found")
  */
def generateOAF(
  input: String,
  exportLinks: Boolean,
  datasourceKey: String,
  resourceClassName: String
): List[Oaf] = {

  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json = parse(input)

  val result = getResult(resourceClassName)
  if (result == null)
    return List()

  val pid = (json \ "id").extract[String]

  result.setPid(
    List(
      OafMapperUtils.structuredProperty(
        pid,
        datasourceKey,
        datasourceKey,
        ModelConstants.DNET_PID_TYPES,
        ModelConstants.DNET_PID_TYPES,
        DATA_INFO
      )
    ).asJava
  )
  // Provisional id; it is replaced further down by IdentifierFactory.createIdentifier.
  result.setId(OafMapperUtils.createOpenaireId(50, s"${datasourceKeyPrefix(datasourceKey)}::$pid", true))
  result.setOriginalId(List(pid).asJava)
  result.setDataInfo(dataInfo)

  val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
  if (titles.isEmpty) {
    throw new RuntimeException("Title not found")
  }
  result.setTitle(
    titles
      .flatMap(_.title.toList)
      .map(title => OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo))
      .asJava
  )

  val dates = (json \\ "dates").extract[List[DateType]]

  // Only entries carrying both a value and a type are usable: (rawDate, dateType).
  val typedDates: List[(String, String)] = dates.flatMap { d =>
    (for {
      rawDate  <- d.date
      dateType <- d.dateType
    } yield (rawDate, dateType)).toList
  }

  // The first "collected" entry decides the collection date; if it cannot be
  // parsed the collection date is left unset.
  typedDates
    .find { case (_, dateType) => dateType.equalsIgnoreCase("collected") }
    .flatMap { case (rawDate, _) => extract_date(rawDate) }
    .foreach(collected => result.setDateofcollection(collected))

  val relevantDates: List[StructuredProperty] = for {
    (rawDate, dateType) <- typedDates
    isoDate             <- extract_date(rawDate).toList
    // Locale.ROOT keeps the qualifier id independent of the JVM default locale.
    dateTypeId = dateType.toLowerCase(java.util.Locale.ROOT)
    qualifier <- Option(
      OafMapperUtils.qualifier(
        dateTypeId,
        dateTypeId,
        ModelConstants.DNET_DATACITE_DATE,
        ModelConstants.DNET_DATACITE_DATE
      )
    ).toList
  } yield generateOAFDate(isoDate, qualifier)
  result.setRelevantdate(relevantDates.asJava)

  result.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava)

  val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
  result.setDescription(
    descriptions
      .flatMap(_.description.toList)
      .map(text => OafMapperUtils.field(text, null))
      .filter(_ != null)
      .asJava
  )

  val subjects = (json \\ "subjects").extract[List[SubjectType]]
  val subjectProperties: List[StructuredProperty] = for {
    subject <- subjects
    value   <- subject.value.toList
    scheme  <- subject.subjectScheme.toList
    uri     <- subject.schemeURI.toList
  } yield OafMapperUtils.structuredProperty(
    value,
    scheme,
    uri,
    ModelConstants.DNET_SUBJECT_TYPOLOGIES,
    ModelConstants.DNET_SUBJECT_TYPOLOGIES,
    null
  )
  result.setSubject(subjectProperties.asJava)

  val instance = result.getInstance().get(0)

  val ids: List[IdentifierType] = for {
    JObject(identifiers)                              <- json \\ "identifiers"
    JField("identifier", JString(identifier))         <- identifiers
    JField("identifierType", JString(identifierType)) <- identifiers
  } yield IdentifierType(identifier, identifierType)

  instance.setUrl(ids.map(_.identifier).asJava)
  if (instance.getUrl.isEmpty) {
    throw new RuntimeException("Url not found")
  }
  instance.setCollectedfrom(collectedFromMap(datasourceKey))
  instance.setPid(result.getPid)

  result.setId(IdentifierFactory.createIdentifier(result))
  if (result.getId == null)
    return List()

  val alternativeIdentifierUrls: List[AlternateIdentifierType] = for {
    JObject(alternateIdentifiers)                               <- json \\ "alternateIdentifiers"
    JField("alternateIdentifier", JString(alternateIdentifier)) <- alternateIdentifiers
  } yield AlternateIdentifierType(alternateIdentifier)

  // Resolve the first alternate identifier against the known URL prefixes.
  // Both steps are optional: a record without alternate identifiers, or with
  // an unrecognised URL, no longer aborts the whole conversion.
  val resolvedAlternateIdentifier: Option[StructuredProperty] = for {
    first   <- alternativeIdentifierUrls.headOption
    pattern <- resolvedURLPattern.find(p => first.alternateIdentifier.startsWith(s"${p._1}"))
  } yield OafMapperUtils.structuredProperty(
    StringUtils.substringAfter(first.alternateIdentifier, s"${pattern._1}"),
    pattern._2,
    pattern._2,
    ModelConstants.DNET_PID_TYPES,
    ModelConstants.DNET_PID_TYPES,
    dataInfo
  )

  // The datasource-scoped pid is always registered as an alternate identifier.
  val defaultAlternatedIdentifer: StructuredProperty = OafMapperUtils.structuredProperty(
    pid,
    datasourceKey,
    datasourceKey,
    ModelConstants.DNET_PID_TYPES,
    ModelConstants.DNET_PID_TYPES,
    dataInfo
  )
  instance.setAlternateIdentifier(
    (resolvedAlternateIdentifier.toList ::: List(defaultAlternatedIdentifer)).asJava
  )

  val relations: List[Relation] =
    if (exportLinks) {
      val rels: List[RelatedIdentifierType] = for {
        JObject(relIdentifier)                                          <- json \\ "relatedIdentifiers"
        JField("relationType", JString(relationType))                   <- relIdentifier
        JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
        JField("relatedIdentifier", JString(relatedIdentifier))         <- relIdentifier
      } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)

      generateRelations(rels, result.getId, null, pid, datasourceKey)
    } else List()

  List[Oaf](result) ::: relations
}
|
|
|
|
|
|
|
|
/** Builds one OAF relation for every related identifier of a record.
  *
  * Each relation gets `id` as source, `REL_TYPE_VALUE` as relation type, the
  * sub-relation type looked up in `subRelTypeMapping`, the original relation
  * type as relation class, and a single `DATE_RELATION_KEY` property holding
  * `date`. The target is an unresolved identifier derived from the first entry
  * of `resolvedURLPattern` whose prefix the related identifier starts with;
  * when no prefix matches the target is left unset.
  *
  * @param rels          related identifiers extracted from the record
  * @param id            identifier of the source entity
  * @param date          value stored under `DATE_RELATION_KEY` (may be `null`)
  * @param pid           original identifier of the record; currently not used
  * @param datasourceKey key used to look up the `collectedFromMap` entry
  * @return one relation per element of `rels`, in the same order
  */
private def generateRelations(
  rels: List[RelatedIdentifierType],
  id: String,
  date: String,
  pid: String,
  datasourceKey: String
): List[Relation] =
  rels.map { r =>
    val rel = new Relation
    rel.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava)
    rel.setDataInfo(dataInfo)

    rel.setRelType(REL_TYPE_VALUE)
    rel.setSubRelType(subRelTypeMapping(r.relationType).relType)
    rel.setRelClass(r.relationType)

    val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
    rel.setProperties(List(dateProps).asJava)

    // First matching prefix wins. The earlier version set the target as a side
    // effect inside an eager map, so every matching prefix overwrote the target.
    resolvedURLPattern
      .find(p => r.relatedIdentifier.startsWith(s"${p._1}"))
      .foreach { p =>
        val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"${p._1}")
        rel.setTarget(DHPUtils.generateUnresolvedIdentifier(relatedId, p._2))
      }

    rel.setSource(id)
    rel
  }
|
|
|
|
}
|