Compare commits

...

2 Commits

5 changed files with 1014 additions and 0 deletions

View File

@ -0,0 +1,274 @@
package eu.dnetlib.dhp.bioschema
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
import java.io.InputStream
import java.time.format.DateTimeFormatter
import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
/** Data model of a single record of the input Datacite dataset.
* @param doi the DOI of the record
* @param timestamp timestamp of the last update of the record
* @param isActive flag telling whether the record is active or has been deleted
* @param json the native record serialized as JSON
*/
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/*
The following classes are utility classes used for the mapping from
the Datacite JSON to the OAF schema
*/
// One entry of the "relatedIdentifiers" array of a record.
case class RelatedIdentifierType(
relationType: String,
relatedIdentifier: String,
relatedIdentifierType: String
) {}
// Identifier attached to a creator (see CreatorType.nameIdentifiers).
case class NameIdentifiersType(
nameIdentifierScheme: Option[String],
schemeUri: Option[String],
nameIdentifier: Option[String]
) {}
// One entry of the "creators" array of a record; every field is optional.
case class CreatorType(
nameType: Option[String],
nameIdentifiers: Option[List[NameIdentifiersType]],
name: Option[String],
familyName: Option[String],
givenName: Option[String],
affiliation: Option[List[String]]
) {}
// One entry of the "titles" array of a record.
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
// Subject value with its optional scheme.
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
// One entry of the "descriptions" array of a record.
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
// Funding reference (funder and award details); every field is optional.
case class FundingReferenceType(
funderIdentifierType: Option[String],
awardTitle: Option[String],
awardUri: Option[String],
funderName: Option[String],
funderIdentifier: Option[String],
awardNumber: Option[String]
) {}
// One entry of the "dates" array of a record.
case class DateType(date: Option[String], dateType: Option[String]) {}
// A relation name together with its inverse relation name and its sub-relation type.
case class OAFRelations(relation: String, inverse: String, relType: String)
// Empty, serializable companion class of the BioschemaModelConstants object, which holds the actual constants.
class BioschemaModelConstants extends Serializable {}
object BioschemaModelConstants {
// Relation type assigned to the relations built from the related identifiers of a record.
val REL_TYPE_VALUE: String = "resultResult"
// Key of the relation property that carries the date associated with a relation.
val DATE_RELATION_KEY = "RelationDate"
// Classpath location of the resource read into `datacite_filter`.
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
// Default DataInfo shared by the mapping code, built with trust "0.9".
val dataInfo: DataInfo = dataciteDataInfo("0.9")
/** Relation semantics indexed by relation name.
  *
  * Each triple is (relation, inverse relation, sub-relation type). The map key is always the
  * relation itself, so the map is derived from the triples instead of repeating the key by hand.
  * DOCUMENTS and IS_DOCUMENTED_BY used to be declared twice with identical values; each relation
  * is now listed exactly once, which leaves the resulting map unchanged.
  */
val subRelTypeMapping: Map[String, OAFRelations] = {
  val relationTriples: List[(String, String, String)] = List(
    (ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
    (ModelConstants.IS_REFERENCED_BY, ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
    (ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.IS_SUPPLEMENT_TO, ModelConstants.SUPPLEMENT),
    (ModelConstants.IS_SUPPLEMENT_TO, ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.SUPPLEMENT),
    (ModelConstants.HAS_PART, ModelConstants.IS_PART_OF, ModelConstants.PART),
    (ModelConstants.IS_PART_OF, ModelConstants.HAS_PART, ModelConstants.PART),
    (ModelConstants.IS_VERSION_OF, ModelConstants.HAS_VERSION, ModelConstants.VERSION),
    (ModelConstants.HAS_VERSION, ModelConstants.IS_VERSION_OF, ModelConstants.VERSION),
    (ModelConstants.IS_IDENTICAL_TO, ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
    (ModelConstants.IS_CONTINUED_BY, ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
    (ModelConstants.CONTINUES, ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
    (ModelConstants.IS_NEW_VERSION_OF, ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
    (ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
    (ModelConstants.IS_DOCUMENTED_BY, ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
    (ModelConstants.DOCUMENTS, ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
    (ModelConstants.IS_SOURCE_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    (ModelConstants.IS_DERIVED_FROM, ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
    (ModelConstants.CITES, ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
    (ModelConstants.IS_CITED_BY, ModelConstants.CITES, ModelConstants.CITATION),
    // NOTE(review): the two entries below have no symmetric counterpart in this table — confirm intended.
    (ModelConstants.IS_VARIANT_FORM_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    (ModelConstants.IS_OBSOLETED_BY, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
    (ModelConstants.REVIEWS, ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
    (ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEWS, ModelConstants.REVIEW),
    (ModelConstants.COMPILES, ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
    (ModelConstants.IS_COMPILED_BY, ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
  )
  relationTriples.map { case (relation, inverse, subRelType) =>
    relation -> OAFRelations(relation, inverse, subRelType)
  }.toMap
}
/** Lines of the classpath resource located at DATACITE_FILTER_PATH, read once at initialization.
  *
  * Fails with an IllegalArgumentException naming the resource when it is not on the classpath.
  * The underlying stream is closed as soon as the lines have been materialized.
  */
val datacite_filter: List[String] = {
  val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
  require(stream != null, s"Resource not found on classpath: $DATACITE_FILTER_PATH")
  val source = Source.fromInputStream(stream)
  // toList forces the lazy line iterator before the source (and its stream) is closed
  try source.getLines().toList
  finally source.close()
}
/** Builds a DataInfo carrying the given trust value.
  *
  * All boolean flags are false, the second argument is null and the provenance action is
  * ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER.
  *
  * @param trust the trust value to store in the DataInfo
  * @return the DataInfo produced by OafMapperUtils.dataInfo
  */
def dataciteDataInfo(trust: String): DataInfo = {
  val provenanceAction = ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER
  OafMapperUtils.dataInfo(false, null, false, false, provenanceAction, trust)
}
// Date formatter for the ENGLISH locale: the pattern is a sequence of optional sections,
// one per accepted layout (numeric and abbreviated-month variants, 2- and 4-digit years).
// NOTE(review): the section [dd-MM-yy] appears twice in the pattern — looks redundant, confirm before removing.
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
Locale.ENGLISH
)
// Date formatter for the ITALIAN locale accepting day-month-year with '-' or '/' separators.
val df_it: DateTimeFormatter =
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
/** Patterns recognising grant-agreement award URIs, each paired with the identifier prefix
  * used for the matching project. Group 2 of each pattern captures six digits that follow
  * the funding-programme path. Matching is case-insensitive and multi-line.
  */
val funder_regex: List[(Pattern, String)] = {
  val flags = Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
  val h2020 = Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", flags)
  val fp7 = Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", flags)
  List(
    h2020 -> "40|corda__h2020::",
    fp7 -> "40|corda_______::"
  )
}
/** Patterns used to locate a date inside free text, in priority order:
  * year-month-day, month-day-year, day-month-year and finally a bare four-digit year.
  * All of them are compiled with Pattern.MULTILINE.
  */
val Date_regex: List[Pattern] = {
  // Y-M-D: the same separator must be used on both sides of the month (back-reference \2)
  val yearMonthDay = "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])"
  // M-D-Y: the century part of the year is optional
  val monthDayYear =
    "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d"
  // D-M-Y: accepts numeric months and English three-letter month abbreviations
  val dayMonthYear =
    "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})"
  // Y: a four-digit year starting with 19 or 20
  val yearOnly = "(19|20)\\d\\d"
  List(yearMonthDay, monthDayYear, dayMonthYear, yearOnly)
    .map(regex => Pattern.compile(regex, Pattern.MULTILINE))
}
}

View File

@ -0,0 +1,529 @@
package eu.dnetlib.dhp.bioschema
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
object BioschemaToOAFTransformation {
// Jackson mapper with default configuration, available to the members of this object.
val mapper = new ObjectMapper()
// DataInfo attached to the values produced by this transformation (trust "0.9").
// NOTE(review): built with the same arguments as BioschemaModelConstants.dataciteDataInfo("0.9"),
// i.e. it duplicates the imported `dataInfo` constant — consider keeping only one of the two.
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
/** Datasources the records are collected from, indexed by a short key.
  * Only the "ped" entry is defined.
  */
val collectedFromMap: Map[String, KeyValue] = {
  // NOTE(review): the identifier still ends with the placeholder "changeme" — confirm the final datasource id.
  val pedDatasource: KeyValue = OafMapperUtils.keyValue("10|ped_________::changeme", "PED")
  pedDatasource.setDataInfo(DATA_INFO)
  Map("ped" -> pedDatasource)
}
/** Tells whether a record has to be discarded because its JSON contains at least one of
  * the strings listed in the datacite_filter resource.
  *
  * @param json the raw JSON of the record
  * @return true if the record should be skipped
  */
def skip_record(json: String): Boolean =
  datacite_filter.exists(json.contains(_))
/** Wraps an OAF entity into an AtomicAction and serializes the action to JSON.
  *
  * @param item the entity to wrap; datasets, publications, software, other research products
  *             and relations are supported
  * @return the pair (canonical class name of the entity, JSON of the action), or null when
  *         the entity is of any other type
  */
@deprecated("this method will be removed", "dhp")
def toActionSet(item: Oaf): (String, String) =
  item match {
    case dataset: OafDataset       => serializeAction(dataset, classOf[OafDataset])
    case publication: Publication  => serializeAction(publication, classOf[Publication])
    case software: Software        => serializeAction(software, classOf[Software])
    case orp: OtherResearchProduct => serializeAction(orp, classOf[OtherResearchProduct])
    case relation: Relation        => serializeAction(relation, classOf[Relation])
    // unsupported types keep yielding null, as callers may rely on it
    case _ => null
  }

/** Builds the AtomicAction for `payload` and serializes it with the shared object-level mapper
  * (no per-call ObjectMapper allocation).
  */
private def serializeAction[T <: Oaf](payload: T, clazz: Class[T]): (String, String) = {
  val action = new AtomicAction[T]
  action.setClazz(clazz)
  action.setPayload(payload)
  (payload.getClass.getCanonicalName, mapper.writeValueAsString(action))
}
/** Tells whether an embargo has already ended.
  *
  * @param embargo_end_date the embargo end date, formatted as yyyy-MM-dd
  * @return true if today is strictly after the given date
  */
def embargo_end(embargo_end_date: String): Boolean = {
  val isoDay = DateTimeFormatter.ofPattern("[yyyy-MM-dd]")
  val embargoEnd = LocalDate.parse(embargo_end_date, isoDay)
  LocalDate.now().isAfter(embargoEnd)
}
/** Extracts the first recognisable date from a free-text value and normalises it.
  *
  * The patterns of Date_regex are tried in order and evaluation stops at the first one that
  * matches. A bare four-digit year is expanded to "01-01-<year>". The candidate is then parsed
  * with df_en and, if that fails, with df_it.
  *
  * @param input the text that may contain a date; null is treated as "no date"
  * @return the date as rendered by LocalDate.toString, or None when nothing matches or the
  *         matched text cannot be parsed by either formatter
  */
def extract_date(input: String): Option[String] = {
  val candidate: Option[String] = Option(input).flatMap { text =>
    // the iterator keeps the search lazy: later patterns are not run once one has matched
    Date_regex.iterator
      .map(_.matcher(text))
      .find(_.find())
      .map(_.group(0))
  }
  candidate.flatMap { matched =>
    val normalized = if (matched.length == 4) s"01-01-$matched" else matched
    // Try only traps non-fatal errors, so fatal JVM errors are no longer swallowed
    scala.util
      .Try(LocalDate.parse(normalized, df_en))
      .orElse(scala.util.Try(LocalDate.parse(normalized, df_it)))
      .toOption
      .map(_.toString)
  }
}
/** Re-interprets a date whose year is expressed in the Thai Buddhist calendar and converts
  * it to the ISO calendar.
  *
  * @param input  the date text
  * @param format the DateTimeFormatter pattern describing `input`
  * @return the converted date as rendered by LocalDate.toString, or the empty string when the
  *         input cannot be parsed or converted (including a null input or an invalid pattern)
  */
def fix_thai_date(input: String, format: String): String =
  try {
    val parsed = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
    // the parsed year/month/day are taken as Thai Buddhist calendar fields
    val buddhist = ThaiBuddhistDate.of(parsed.getYear, parsed.getMonth.getValue, parsed.getDayOfMonth)
    LocalDate.from(buddhist).toString
  } catch {
    // only recoverable failures map to ""; fatal JVM errors propagate
    case scala.util.control.NonFatal(_) => ""
  }
/** Resolves the instance type of a record and the corresponding result typology.
  *
  * The candidates are looked up as synonyms in the dnet publication-resource vocabulary in
  * this order: resourceType, schemaOrg, resourceTypeGeneral. Null or empty candidates are
  * ignored and the first candidate that resolves wins.
  *
  * @return the pair (instance type qualifier, result typology qualifier looked up from the
  *         class id of the former), or null when no candidate resolves
  */
def getTypeQualifier(
  resourceType: String,
  resourceTypeGeneral: String,
  schemaOrg: String,
  vocabularies: VocabularyGroup
): (Qualifier, Qualifier) = {
  // Iterator keeps the lookups lazy: the vocabulary is not queried past the first hit
  val instanceType: Option[Qualifier] = Iterator(resourceType, schemaOrg, resourceTypeGeneral)
    .filter(candidate => candidate != null && candidate.nonEmpty)
    .map(candidate =>
      vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, candidate)
    )
    .find(qualifier => qualifier != null)
  instanceType match {
    case Some(qualifier) =>
      (
        qualifier,
        vocabularies.getSynonymAsQualifier(
          ModelConstants.DNET_RESULT_TYPOLOGIES,
          qualifier.getClassid
        )
      )
    case None => null
  }
}
/** Creates the empty OAF result matching the type of a record.
  *
  * The type is resolved through getTypeQualifier; the class name of the result typology
  * selects the concrete result class ("dataset", "publication", "software" or "other").
  * The returned result carries a single Instance whose instance type is the resolved qualifier.
  *
  * @return the new result, or null when the type cannot be resolved, the typology qualifier
  *         is missing, or its class name is not one of the four supported values
  */
def getResult(
  resourceType: String,
  resourceTypeGeneral: String,
  schemaOrg: String,
  vocabularies: VocabularyGroup
): Result = {
  val typeQualifiers: (Qualifier, Qualifier) =
    getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
  val result: Result =
    if (typeQualifiers == null || typeQualifiers._2 == null) null
    else
      typeQualifiers._2.getClassname match {
        case "dataset"     => new OafDataset
        case "publication" => new Publication
        case "software"    => new Software
        case "other"       => new OtherResearchProduct
        // an unknown typology used to raise a MatchError; it now yields null like the other
        // "cannot map" outcomes, which callers already handle
        case _ => null
      }
  if (result != null) {
    val instance = new Instance
    instance.setInstancetype(typeQualifiers._1)
    result.setInstance(List(instance).asJava)
  }
  result
}
/** Tells whether a record declares a date of type "available" (case-insensitive).
  *
  * @param input the raw JSON of the record
  * @return true if any "dateType" string found under a "dates" element equals "available"
  */
def available_date(input: String): Boolean = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val parsed: org.json4s.JValue = parse(input)
  val declaredDateTypes: List[String] = for {
    JObject(dateFields) <- parsed \\ "dates"
    JField("dateType", JString(dateType)) <- dateFields
  } yield dateType
  declaredDateTypes.exists(_.equalsIgnoreCase("available"))
}
/** Builds an identifier of the form "<prefix>::<md5>".
  *
  * The prefix is "<idPrefix>|<lower-cased pidType>" right-padded with '_' up to 15 characters;
  * the md5 is computed on the lower-cased pid.
  */
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
  val paddedPrefix = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
  val pidHash = IdentifierFactory.md5(pid.toLowerCase)
  s"$paddedPrefix::$pidHash"
}
/** Wraps a date value and its qualifier into a StructuredProperty; the third argument of the
  * underlying factory is left null.
  *
  * @param dt the date value
  * @param q  the qualifier describing the kind of date
  */
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty =
  OafMapperUtils.structuredProperty(dt, q, null)
/** Creates a result-to-project relation.
  *
  * The relation type is ModelConstants.RESULT_PROJECT and the sub-relation type is
  * ModelConstants.OUTCOME; everything else comes from the arguments.
  *
  * @param sourceId identifier of the source of the relation
  * @param targetId identifier of the target of the relation
  * @param relClass relation class
  * @param cf       the single collected-from entry of the relation
  * @param di       the DataInfo of the relation
  */
def generateRelation(
  sourceId: String,
  targetId: String,
  relClass: String,
  cf: KeyValue,
  di: DataInfo
): Relation = {
  val relation = new Relation
  // endpoints
  relation.setSource(sourceId)
  relation.setTarget(targetId)
  // semantics
  relation.setRelType(ModelConstants.RESULT_PROJECT)
  relation.setSubRelType(ModelConstants.OUTCOME)
  relation.setRelClass(relClass)
  // provenance
  relation.setDataInfo(di)
  relation.setCollectedfrom(List(cf).asJava)
  relation
}
/** Builds the "isProducedBy" relation between a result and the project referenced by an award URI.
  *
  * The first pattern of funder_regex found in the URI decides the project identifier prefix;
  * the grant id is the second capture group of that match (the six digits after the programme
  * path). Reading the group directly, instead of rewriting the whole input with
  * replaceAll("$2"), keeps text surrounding the match out of the grant id; URIs that consist
  * only of the matched text produce the same identifier as before.
  *
  * @param awardUri the award URI taken from a funding reference
  * @param sourceId identifier of the result
  * @return a single-element list with the relation, or an empty list when no pattern matches
  */
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] =
  funder_regex.iterator
    .map { case (pattern, projectPrefix) => (pattern.matcher(awardUri), projectPrefix) }
    // find() runs once per matcher; the successful matcher keeps its match state for group(2)
    .find { case (matcher, _) => matcher.find() }
    .map { case (matcher, projectPrefix) =>
      val grantId = matcher.group(2)
      val targetId = s"$projectPrefix${DHPUtils.md5(grantId)}"
      generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo)
    }
    .toList
/** Maps one JSON record to its OAF representation.
  *
  * @param input            the raw JSON of the record
  * @param ts               not referenced in the body of this method
  * @param dateOfCollection not referenced in the body of this method
  * @param vocabularies     vocabularies used to resolve types, dates, languages and access rights
  * @param exportLinks      when true, relations are also generated from "relatedIdentifiers"
  * @return an empty list when the record is skipped, its type cannot be mapped, or no identifier
  *         can be created; otherwise the result followed by its relations (if any)
  */
def generateOAF(
input: String,
ts: Long,
dateOfCollection: Long,
vocabularies: VocabularyGroup,
exportLinks: Boolean
): List[Oaf] = {
// records containing one of the filtered strings are dropped before any parsing
if (skip_record(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val resourceType = (json \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral =
(json \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "types" \ "schemaOrg").extractOrElse[String](null)
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (result == null)
return List()
// the record "id" becomes the pid (class "ped") and the original id of the result
val pid = (json \ "id").extract[String]
result.setPid(
List(
OafMapperUtils.structuredProperty(
pid,
"ped",
"ped",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava
)
// provisional identifier; it is replaced further down by IdentifierFactory.createIdentifier
result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
result.setOriginalId(List(pid).asJava)
result.setDataInfo(dataInfo)
// NOTE(review): `creators` is extracted but never used in this method — authors are not mapped; confirm intended.
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
// titles without a titleType are mapped as main titles
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(
titles
.filter(t => t.title.nonEmpty)
.map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(
t.title.get,
t.titleType.get,
t.titleType.get,
ModelConstants.DNET_DATACITE_TITLE,
ModelConstants.DNET_DATACITE_TITLE,
null
)
}
})
.asJava
)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
// first date of type "issued": outer Option = such a date exists, inner Option = it could be parsed
val i_date = dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
.map(d => extract_date(d.date.get))
// first parsable date of type "available", used as embargo end date
val a_date: Option[String] = dates
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
.map(d => extract_date(d.date.get))
.find(d => d != null && d.isDefined)
.map(d => d.get)
if (a_date.isDefined) {
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
// date of acceptance: the issued date when available, otherwise derived from the publication year
if (i_date.isDefined && i_date.get.isDefined) {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
} else if (publication_year != null) {
// NOTE(review): this fallback stores "01-01-<year>", while extract_date yields LocalDate.toString (yyyy-MM-dd) — confirm the inconsistency is intended.
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result
.getInstance()
.get(0)
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
// relevant dates: every parsable date whose lower-cased type is a term of the datacite date vocabulary
result.setRelevantdate(
dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d =>
(
d._1.get,
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
)
)
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2))
.asJava
)
result.setCollectedfrom(List(collectedFromMap("ped")).asJava)
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
result.setDescription(
descriptions
.filter(d => d.description.isDefined)
.map(d => OafMapperUtils.field(d.description.get, null))
.filter(s => s != null)
.asJava
)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
)
val instance = result.getInstance().get(0)
// rights URIs are used both for the access right and for the license detection below
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
// first rights URI that is a synonym in the access-modes vocabulary
val aRights: Option[AccessRight] = accessRights
.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
})
.find(q => q != null)
.map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
// falls back to the UNKNOWN / NOT_AVAILABLE access right when no rights URI resolves
val access_rights_qualifier =
if (aRights.isDefined) aRights.get
else
OafMapperUtils.accessRight(
ModelConstants.UNKNOWN,
ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
instance.setCollectedfrom(collectedFromMap("ped"))
instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
// first http(s) rights URI that looks like a license location
val license = accessRights
.find(r =>
r.startsWith("http") && r.matches(
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
)
)
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
// final identifier of the result, overriding the provisional one set earlier
result.setId(IdentifierFactory.createIdentifier(result))
// NOTE(review): project relations are computed before the null-id check below; the work is discarded when the id is null.
var relations: List[Relation] =
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
if (result.getId == null)
return List()
if (exportLinks) {
// only related identifiers carrying all three string fields are considered
val rels: List[RelatedIdentifierType] = for {
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
// the issued date (or null when missing) is attached to every generated relation
relations = relations ::: generateRelations(
rels,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
} else
List(result)
}
/** Builds the result-to-result relations described by the related identifiers of a record.
  *
  * Only related identifiers whose relation type is a key of subRelTypeMapping and whose
  * identifier type is doi, pmid or arxiv (case-insensitive) are considered. Each relation
  * targets the unresolved identifier generated from the related identifier.
  *
  * @param rels the related identifiers of the record
  * @param id   identifier of the source result
  * @param date value stored under DATE_RELATION_KEY in the relation properties; callers may pass null
  * @return one relation per accepted related identifier
  */
private def generateRelations(
  rels: List[RelatedIdentifierType],
  id: String,
  date: String
): List[Relation] = {
  val supportedPidTypes = List("doi", "pmid", "arxiv")
  rels
    .filter(r =>
      subRelTypeMapping.contains(r.relationType) &&
        supportedPidTypes.exists(pidType => r.relatedIdentifierType.equalsIgnoreCase(pidType))
    )
    .map { r =>
      val rel = new Relation
      rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
      rel.setDataInfo(dataInfo)
      rel.setRelType(REL_TYPE_VALUE)
      rel.setSubRelType(subRelTypeMapping(r.relationType).relType)
      rel.setRelClass(r.relationType)
      // NOTE(review): `date` may be null here, producing a property with a null value — confirm downstream tolerance.
      rel.setProperties(List(OafMapperUtils.keyValue(DATE_RELATION_KEY, date)).asJava)
      rel.setSource(id)
      rel.setTarget(
        DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
      )
      rel
    }
}
/** Builds an identifier of the form "10|<namespace>::<md5>" from a "namespace::localId" string:
  * the text before the first "::" is kept as is, the text after it is replaced by its md5.
  */
def generateDSId(input: String): String = {
  val separator = "::"
  val namespace = StringUtils.substringBefore(input, separator)
  val localId = StringUtils.substringAfter(input, separator)
  val localIdHash = DHPUtils.md5(localId)
  s"10|$namespace::$localIdHash"
}
}

View File

@ -0,0 +1,41 @@
{
"id": "PED00001#P38634_A_1",
"types": {
"resourceType": "Protein",
"resourceTypeGeneral": "Dataset"
},
"creators": [],
"identifiers": [
{
"identifier": "https://proteinensemble.org/PED00001#P38634_A_1",
"identifierType": "URL"
}
],
"relatedIdentifiers": [
{
"relationType": "CitedBy",
"relatedIdentifier": "https://identifiers.org/pubmed:20399186"
},
{
"relationType": "IsIdenticalTo",
"relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634"
}
],
"alternateIdentifiers": [
{
"alternateIdentifier": "https://identifiers.org/uniprot:P38634"
}
],
"descriptions": [],
"titles": [
{
"title": "Protein SIC1"
}
],
"dates": [
{
"date": "2021-12-09T21:10:30",
"dateType": "Collected"
}
]
}

View File

@ -0,0 +1,108 @@
package eu.dnetlib.dhp.bioschema
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
//import eu.dnetlib.dhp.bioschema.{BioschemaToOAFTransformation, GenerateDataciteDatasetSpark}
import eu.dnetlib.dhp.bioschema.BioschemaToOAFTransformation
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.commons.io.FileUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, count}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
import java.nio.file.{Files, Path}
import java.text.SimpleDateFormat
import java.util.Locale
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class BioschemaDataciteToOAFTest extends AbstractVocabularyTest {
// Temporary directory created before each test and deleted after it.
private var workingDir: Path = null
val log: Logger = LoggerFactory.getLogger(getClass)
// Creates a fresh temporary working directory and initialises the vocabularies of the base class.
@BeforeEach
def setUp(): Unit = {
workingDir = Files.createTempDirectory(getClass.getSimpleName)
super.setUpVocabulary()
}
// Recursively removes the temporary working directory.
@AfterEach
def tearDown(): Unit = {
FileUtils.deleteDirectory(workingDir.toFile)
}
/** Checks that an ISO-8601 timestamp with a numeric zone offset is parsed to the expected instant. */
@Test
def testDateMapping(): Unit = {
  val inputDate = "2021-07-14T11:52:54+0000"
  val iso8601Format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
  val parsed = iso8601Format.parse(inputDate)
  // 2021-07-14T11:52:54Z as epoch milliseconds; the explicit +0000 offset makes the
  // expectation independent of the default time zone of the JVM
  assertEquals(1626263574000L, parsed.getTime)
}
// @Test
// def testConvert(): Unit = {
//
// val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
//
// val conf = new SparkConf()
// val spark: SparkSession = SparkSession
// .builder()
// .config(conf)
// .appName(getClass.getSimpleName)
// .master("local[*]")
// .getOrCreate()
//
// implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
// val instance = new GenerateDataciteDatasetSpark(null, null, log)
// val targetPath = s"$workingDir/result"
//
// instance.generateDataciteDataset(path, exportLinks = true, vocabularies, targetPath, spark)
//
// import spark.implicits._
//
// val nativeSize = spark.read.load(path).count()
//
// assertEquals(100, nativeSize)
//
// val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
//
// result
// .map(s => s.getClass.getSimpleName)
// .groupBy(col("value").alias("class"))
// .agg(count("value").alias("Total"))
// .show(false)
//
// val t = spark.read.load(targetPath).count()
//
// assertTrue(t > 0)
//
// spark.stop()
//
// }
/** Runs the transformation on the PED sample record and prints every generated OAF object. */
@Test
def testMapping(): Unit = {
  val fixturePath = "/eu/dnetlib/dhp/bioschema/ped_record.json"
  val stream = getClass.getResourceAsStream(fixturePath)
  // fail with a clear message instead of a NullPointerException when the fixture is missing
  assertNotNull(stream, s"test fixture not found on classpath: $fixturePath")
  val source = Source.fromInputStream(stream)
  val record =
    try source.mkString
    finally source.close()
  val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
  val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
  // NOTE(review): no assertion is made on the content of the mapping yet — add expectations on `res`.
  assertNotNull(res)
  res.foreach { oaf =>
    println(mapper.writeValueAsString(oaf))
    println("----------------------------")
  }
}
}

View File

@ -0,0 +1,62 @@
https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612
PED
<property>
<name>workingPath</name>
<value>/data/bioschema/ped/</value>
<description>the working path</description>
</property>
<property>
<name>sitemapUrl</name>
<value>https://proteinensemble.org/sitemap2.xml.gz</value>
</property>
<property>
<name>sitemapURLKey</name>
<value>loc</value>
</property>
<property>
<name>dynamic</name>
<value>true</value>
<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>
DISPROT
<property>
<name>workingPath</name>
<value>/data/bioschema/disprot/</value>
<description>the working path</description>
</property>
<property>
<name>sitemapUrl</name>
<value>https://disprot.org/sitemap2.xml.gz</value>
</property>
<property>
<name>sitemapURLKey</name>
<value>loc</value>
</property>
<property>
<name>dynamic</name>
<value>true</value>
<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>
MOBIDB
<property>
<name>workingPath</name>
<value>/data/bioschema/mobidb/</value>
<description>the working path</description>
</property>
<property>
<name>sitemapUrl</name>
<value>https://mobidb.org/sitemap2.xml.gz</value>
</property>
<property>
<name>sitemapURLKey</name>
<value>loc</value>
</property>
<property>
<name>dynamic</name>
<value>true</value>
<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>
<property>