forked from D-Net/dnet-hadoop
Compare commits
No commits in common. "2bc79c50f8a64858e2d093163bb5c05dfa00803b" and "5226d0a100eaedd4c06a832bb4ca2be68057d0a5" have entirely different histories.
2bc79c50f8
...
5226d0a100
|
@ -1,274 +0,0 @@
|
||||||
package eu.dnetlib.dhp.bioschema
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
|
|
||||||
|
|
||||||
import java.io.InputStream
|
|
||||||
import java.time.format.DateTimeFormatter
|
|
||||||
import java.util.Locale
|
|
||||||
import java.util.regex.Pattern
|
|
||||||
import scala.io.Source
|
|
||||||
|
|
||||||
/** Model of one record of the input Datacite dataset.
  *
  * @param doi       the DOI of the record
  * @param timestamp timestamp of the last update
  * @param isActive  false when the record was deleted upstream
  * @param json      the native JSON payload of the record
  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String)
|
|
||||||
|
|
||||||
/* The following case classes model the Datacite-style JSON input consumed by the OAF mapping. */

/** A related identifier entry: relation name plus the target identifier and its type (doi, pmid, ...). */
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String)
|
|
||||||
|
|
||||||
/** A creator's name identifier (e.g. an ORCID) together with its scheme and scheme URI. */
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String])
|
|
||||||
|
|
||||||
/** A creator entry: optional name parts, name identifiers and affiliations. */
case class CreatorType(
  nameType: Option[String],
  nameIdentifiers: Option[List[NameIdentifiersType]],
  name: Option[String],
  familyName: Option[String],
  givenName: Option[String],
  affiliation: Option[List[String]]
)
|
|
||||||
|
|
||||||
/** A title entry with its optional type and language. */
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String])
|
|
||||||
|
|
||||||
/** A subject keyword with its optional classification scheme. */
case class SubjectType(subject: Option[String], subjectScheme: Option[String])
|
|
||||||
|
|
||||||
/** A description entry with its optional type (e.g. Abstract). */
case class DescriptionType(descriptionType: Option[String], description: Option[String])
|
|
||||||
|
|
||||||
/** A funding reference: funder identification plus award details. */
case class FundingReferenceType(
  funderIdentifierType: Option[String],
  awardTitle: Option[String],
  awardUri: Option[String],
  funderName: Option[String],
  funderIdentifier: Option[String],
  awardNumber: Option[String]
)
|
|
||||||
|
|
||||||
/** A dated event with its type (e.g. Issued, Available, Collected). */
case class DateType(date: Option[String], dateType: Option[String])
|
|
||||||
|
|
||||||
/** An OAF relation triple: relation class, its inverse, and the subRelType grouping. */
case class OAFRelations(relation: String, inverse: String, relType: String)
|
|
||||||
|
|
||||||
// Empty serializable marker class paired with the companion object below.
// NOTE(review): appears unused in this file — presumably kept so Spark closures can serialize; confirm before removing.
class BioschemaModelConstants extends Serializable {}
|
|
||||||
|
|
||||||
/** Constants and lookup tables shared by the bioschema-to-OAF mapping. */
object BioschemaModelConstants {

  // relType used for every result-result relation emitted by this mapping
  val REL_TYPE_VALUE: String = "resultResult"
  // property key carrying the relation date on generated relations
  val DATE_RELATION_KEY = "RelationDate"
  // NOTE(review): the blacklist resource still lives under the datacite package — confirm the sharing is intentional
  val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
  val DOI_CLASS = "doi"
  val SUBJ_CLASS = "keywords"
  // default provenance (trust 0.9) attached to mapped records
  val dataInfo: DataInfo = dataciteDataInfo("0.9")

  /** Datacite relation name -> (relation, inverse, subRelType).
    * The original map listed DOCUMENTS and IS_DOCUMENTED_BY twice with identical values;
    * the duplicates were removed (later identical keys only overwrote the earlier ones, so behaviour is unchanged).
    */
  val subRelTypeMapping: Map[String, OAFRelations] = Map(
    ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY, ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.IS_SUPPLEMENT_TO, ModelConstants.SUPPLEMENT),
    ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO, ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.SUPPLEMENT),
    ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART, ModelConstants.IS_PART_OF, ModelConstants.PART),
    ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF, ModelConstants.HAS_PART, ModelConstants.PART),
    ModelConstants.IS_VERSION_OF -> OAFRelations(ModelConstants.IS_VERSION_OF, ModelConstants.HAS_VERSION, ModelConstants.VERSION),
    ModelConstants.HAS_VERSION -> OAFRelations(ModelConstants.HAS_VERSION, ModelConstants.IS_VERSION_OF, ModelConstants.VERSION),
    // symmetric relation: it is its own inverse
    ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO, ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY, ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
    ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES, ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(ModelConstants.IS_NEW_VERSION_OF, ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
    ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
    ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY, ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
    ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS, ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM, ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
    ModelConstants.CITES -> OAFRelations(ModelConstants.CITES, ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
    ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY, ModelConstants.CITES, ModelConstants.CITATION),
    // NOTE(review): these two map to asymmetric inverses in the original; preserved as-is
    ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
    ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS, ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
    ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEWS, ModelConstants.REVIEW),
    ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES, ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY, ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
  )

  /** Blacklist fragments loaded once from the classpath.
    * Fix: the Source (and underlying stream) is now closed after loading — the original leaked it.
    */
  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
    require(stream != null, s"resource not found: $DATACITE_FILTER_PATH")
    val source = Source.fromInputStream(stream)
    try source.getLines().toList
    finally source.close()
  }

  /** Builds the default DataInfo with the given trust.
    * NOTE(review): boolean flag order assumed to be (invisible, inferenceprovenance, inferred,
    * deletedbyinference) — confirm against OafMapperUtils.dataInfo.
    */
  def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
    false,
    null,
    false,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
    trust
  )

  // lenient multi-pattern English-locale date formatter (optional sections: first matching wins)
  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
    "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
    Locale.ENGLISH
  )

  // Italian-locale fallback for day-first dates
  val df_it: DateTimeFormatter =
    DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  // (grant URI pattern, OpenAIRE project id prefix); group 2 of each pattern is the grant number
  val funder_regex: List[(Pattern, String)] = List(
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda__h2020::"
    ),
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda_______::"
    )
  )

  // date-shape detectors tried in order; the bare-year pattern is last so full dates win
  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile(
      "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
      Pattern.MULTILINE
    ),
    //M-D-Y
    Pattern.compile(
      "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
      Pattern.MULTILINE
    ),
    //D-M-Y
    Pattern.compile(
      "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
      Pattern.MULTILINE
    ),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )

}
|
|
|
@ -1,529 +0,0 @@
|
||||||
package eu.dnetlib.dhp.bioschema
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
|
||||||
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
|
||||||
import org.apache.commons.lang3.StringUtils
|
|
||||||
import org.json4s.DefaultFormats
|
|
||||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
|
||||||
|
|
||||||
import java.text.SimpleDateFormat
|
|
||||||
import java.time.LocalDate
|
|
||||||
import java.time.chrono.ThaiBuddhistDate
|
|
||||||
import java.time.format.DateTimeFormatter
|
|
||||||
import java.util.{Date, Locale}
|
|
||||||
import scala.collection.JavaConverters._
|
|
||||||
import scala.io.{Codec, Source}
|
|
||||||
|
|
||||||
object BioschemaToOAFTransformation {
|
|
||||||
|
|
||||||
// Jackson mapper shared by this object's serialization helpers.
// NOTE(review): ObjectMapper is mutable — confirm it is never reconfigured concurrently.
val mapper = new ObjectMapper()
|
|
||||||
|
|
||||||
// Default provenance attached to mapped PED entities, trust 0.9.
// NOTE(review): flag order assumed to be (invisible, inferenceprovenance, inferred,
// deletedbyinference) — confirm against OafMapperUtils.dataInfo.
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
  false,
  null,
  false,
  false,
  ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
  "0.9"
)
|
|
||||||
|
|
||||||
// Datasource used as "collectedfrom" for every mapped record.
// "ped" is the only supported provider key; the datasource id is a placeholder ("changeme").
val collectedFromMap: Map[String, KeyValue] = {
  val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
    "10|ped_________::changeme",
    "PED"
  )
  PEDCollectedFrom.setDataInfo(DATA_INFO)

  Map(
    "ped" -> PEDCollectedFrom
  )
}
|
|
||||||
|
|
||||||
/** Decides whether a native record must be skipped because it contains
  * one of the blacklisted fragments loaded from the datacite_filter file.
  *
  * @param json the native JSON record
  * @return true if the record should be skipped
  */
def skip_record(json: String): Boolean =
  datacite_filter.exists(json.contains(_))
|
|
||||||
|
|
||||||
/** Serializes an Oaf entity into an action-set pair (canonical class name, AtomicAction JSON).
  * Returns null for unsupported Oaf subtypes.
  */
@deprecated("this method will be removed", "dhp")
def toActionSet(item: Oaf): (String, String) = {
  val serializer = new ObjectMapper()

  // wraps one payload into an AtomicAction and serializes it
  def encode[T <: Oaf](payload: T, clazz: Class[T]): (String, String) = {
    val action = new AtomicAction[T]
    action.setClazz(clazz)
    action.setPayload(payload)
    (payload.getClass.getCanonicalName, serializer.writeValueAsString(action))
  }

  item match {
    case dataset: OafDataset       => encode(dataset, classOf[OafDataset])
    case publication: Publication  => encode(publication, classOf[Publication])
    case software: Software        => encode(software, classOf[Software])
    case orp: OtherResearchProduct => encode(orp, classOf[OtherResearchProduct])
    case relation: Relation        => encode(relation, classOf[Relation])
    case _                         => null
  }
}
|
|
||||||
|
|
||||||
/** Returns true when the given embargo end date (yyyy-MM-dd) is strictly before today,
  * i.e. the embargo has already expired. Throws if the input does not parse.
  */
def embargo_end(embargo_end_date: String): Boolean = {
  val endDate = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
  LocalDate.now().isAfter(endDate)
}
|
|
||||||
|
|
||||||
/** Extracts the first recognizable date from free text and normalizes it to ISO (yyyy-MM-dd).
  * A bare 4-digit year is normalized to January 1st of that year.
  * Returns None when no pattern matches or the matched text cannot be parsed.
  */
def extract_date(input: String): Option[String] = {
  // first regex (in Date_regex declaration order) that matches wins
  val matched: Option[String] = Date_regex.iterator
    .map { pattern =>
      val matcher = pattern.matcher(input)
      if (matcher.find()) matcher.group(0) else null
    }
    .find(_ != null)

  matched.flatMap { raw =>
    val candidate = if (raw.length == 4) s"01-01-$raw" else raw
    // English-locale formats first, then Italian day-first formats, else give up
    try Some(LocalDate.parse(candidate, df_en).toString)
    catch {
      case _: Throwable =>
        try Some(LocalDate.parse(candidate, df_it).toString)
        catch { case _: Throwable => None }
    }
  }
}
|
|
||||||
|
|
||||||
/** Converts a date written with a Thai Buddhist-era year into the ISO calendar.
  * The input is parsed with the given pattern, its year is re-interpreted as a
  * Buddhist-era year (BE = ISO + 543), and the equivalent ISO date is returned.
  * Returns "" on any parse/conversion failure (preserving the original contract).
  *
  * Fix: catches NonFatal instead of Throwable so fatal VM errors (OutOfMemoryError,
  * InterruptedException, ...) are no longer swallowed.
  */
def fix_thai_date(input: String, format: String): String = {
  import scala.util.control.NonFatal
  try {
    val parsed = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
    val thai = ThaiBuddhistDate.of(parsed.getYear, parsed.getMonthValue, parsed.getDayOfMonth)
    LocalDate.from(thai).toString
  } catch {
    case NonFatal(_) => ""
  }
}
|
|
||||||
|
|
||||||
/** Resolves the (instance type, result typology) qualifier pair for a record.
  * Lookup order is preserved from the original: resourceType, then schemaOrg,
  * then resourceTypeGeneral; the first term that resolves in the
  * dnet:publication_resource vocabulary wins.
  * Returns null when nothing resolves (callers check for null).
  *
  * Fix: the three byte-identical lookup branches were deduplicated into one helper.
  */
def getTypeQualifier(
  resourceType: String,
  resourceTypeGeneral: String,
  schemaOrg: String,
  vocabularies: VocabularyGroup
): (Qualifier, Qualifier) = {
  // resolves a single candidate term, pairing it with its result typology
  def resolve(term: String): Option[(Qualifier, Qualifier)] =
    if (term == null || term.isEmpty) None
    else
      Option(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, term))
        .map(typeQualifier =>
          (
            typeQualifier,
            vocabularies.getSynonymAsQualifier(
              ModelConstants.DNET_RESULT_TYPOLOGIES,
              typeQualifier.getClassid
            )
          )
        )

  resolve(resourceType)
    .orElse(resolve(schemaOrg))
    .orElse(resolve(resourceTypeGeneral))
    .orNull
}
|
|
||||||
|
|
||||||
/** Instantiates the concrete Result subtype (dataset / publication / software / other)
  * for the record's resolved typology, with a single Instance carrying the type qualifier.
  * Returns null when the type cannot be resolved or is not one of the four known typologies.
  *
  * Fix: the original match was non-exhaustive — an unmapped typology classname threw a
  * MatchError at runtime instead of reaching the trailing `null`. A default case now
  * returns null, consistent with the other unresolvable paths.
  */
def getResult(
  resourceType: String,
  resourceTypeGeneral: String,
  schemaOrg: String,
  vocabularies: VocabularyGroup
): Result = {
  val typeQualifiers: (Qualifier, Qualifier) =
    getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
  if (typeQualifiers == null)
    return null
  val i = new Instance
  i.setInstancetype(typeQualifiers._1)
  typeQualifiers._2.getClassname match {
    case "dataset" =>
      val r = new OafDataset
      r.setInstance(List(i).asJava)
      r
    case "publication" =>
      val r = new Publication
      r.setInstance(List(i).asJava)
      r
    case "software" =>
      val r = new Software
      r.setInstance(List(i).asJava)
      r
    case "other" =>
      val r = new OtherResearchProduct
      r.setInstance(List(i).asJava)
      r
    case _ =>
      // unmapped typology: give up instead of throwing MatchError
      null
  }
}
|
|
||||||
|
|
||||||
/** Returns true when the record declares at least one date whose dateType is
  * "available" (case-insensitive).
  *
  * @param input the native JSON record
  */
def available_date(input: String): Boolean = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json: org.json4s.JValue = parse(input)

  // collect every dateType value found under any "dates" element
  val dateTypes: List[String] = for {
    JObject(dateEntry) <- json \\ "dates"
    JField("dateType", JString(dateType)) <- dateEntry
  } yield dateType

  dateTypes.exists(_.equalsIgnoreCase("available"))
}
|
|
||||||
|
|
||||||
/** Builds a D-Net target identifier: "<prefix>|<pidtype>" padded to 15 chars with '_',
  * followed by "::" and the md5 of the lowercased pid.
  */
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
  val paddedPrefix = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
  val hash = IdentifierFactory.md5(pid.toLowerCase)
  s"$paddedPrefix::$hash"
}
|
|
||||||
|
|
||||||
/** Wraps a date string and its qualifier into a StructuredProperty (no dataInfo attached). */
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
  OafMapperUtils.structuredProperty(dt, q, null)
}
|
|
||||||
|
|
||||||
/** Builds a relation between a result and a project.
  * NOTE(review): relType/subRelType are hard-coded to resultProject/outcome, so this
  * helper is only suitable for project links (e.g. relClass "isProducedBy") — confirm
  * no caller uses it for other relation families.
  *
  * @param sourceId id of the source result
  * @param targetId id of the target project
  * @param relClass relation class to set
  * @param cf       collectedfrom datasource
  * @param di       provenance to attach
  */
def generateRelation(
  sourceId: String,
  targetId: String,
  relClass: String,
  cf: KeyValue,
  di: DataInfo
): Relation = {

  val r = new Relation
  r.setSource(sourceId)
  r.setTarget(targetId)
  r.setRelType(ModelConstants.RESULT_PROJECT)
  r.setRelClass(relClass)
  r.setSubRelType(ModelConstants.OUTCOME)
  r.setCollectedfrom(List(cf).asJava)
  r.setDataInfo(di)
  r

}
|
|
||||||
|
|
||||||
/** Maps a funding award URI onto a result->project "isProducedBy" relation.
  * The first funder pattern that matches wins; group 2 of the pattern is the grant
  * number, hashed into the project id. Returns an empty list when nothing matches.
  */
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
  funder_regex
    .find { case (pattern, _) => pattern.matcher(awardUri).find() }
    .map { case (pattern, idPrefix) =>
      val grantId = pattern.matcher(awardUri).replaceAll("$2")
      val targetId = s"$idPrefix${DHPUtils.md5(grantId)}"
      generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo) :: Nil
    }
    .getOrElse(Nil)
}
|
|
||||||
|
|
||||||
/** Maps one native bioschema/Datacite-style JSON record into a list of Oaf entities:
  * the Result itself plus any project / related-identifier Relations.
  * Returns an empty list when the record is blacklisted, its type cannot be resolved,
  * or no identifier can be produced.
  *
  * @param input            the native JSON record
  * @param ts               record timestamp (NOTE(review): currently unused in this body)
  * @param dateOfCollection collection timestamp (NOTE(review): currently unused in this body)
  * @param vocabularies     vocabulary group for type/date/access-right lookups
  * @param exportLinks      when true, relatedIdentifiers are mapped to Relations too
  */
def generateOAF(
  input: String,
  ts: Long,
  dateOfCollection: Long,
  vocabularies: VocabularyGroup,
  exportLinks: Boolean
): List[Oaf] = {
  // drop records containing blacklisted fragments
  if (skip_record(input))
    return List()

  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json = parse(input)

  val resourceType = (json \ "types" \ "resourceType").extractOrElse[String](null)
  val resourceTypeGeneral =
    (json \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
  val schemaOrg = (json \ "types" \ "schemaOrg").extractOrElse[String](null)

  //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
  val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
  if (result == null)
    return List()

  val pid = (json \ "id").extract[String]

  // pid is classified as "ped" (PED accession), not a standard pid type
  result.setPid(
    List(
      OafMapperUtils.structuredProperty(
        pid,
        "ped",
        "ped",
        ModelConstants.DNET_PID_TYPES,
        ModelConstants.DNET_PID_TYPES,
        DATA_INFO
      )
    ).asJava
  )
  // provisional id; replaced later via IdentifierFactory.createIdentifier
  result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
  result.setOriginalId(List(pid).asJava)

  result.setDataInfo(dataInfo)

  // NOTE(review): creators are extracted but never mapped onto the result — confirm intentional
  val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())

  val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())

  // untyped titles become main titles; typed titles keep their Datacite title type
  result.setTitle(
    titles
      .filter(t => t.title.nonEmpty)
      .map(t => {
        if (t.titleType.isEmpty) {
          OafMapperUtils
            .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
        } else {
          OafMapperUtils.structuredProperty(
            t.title.get,
            t.titleType.get,
            t.titleType.get,
            ModelConstants.DNET_DATACITE_TITLE,
            ModelConstants.DNET_DATACITE_TITLE,
            null
          )
        }
      })
      .asJava
  )

  val dates = (json \\ "dates").extract[List[DateType]]
  val publication_year = (json \\ "publicationYear").extractOrElse[String](null)

  // "issued" date drives dateofacceptance; "available" date drives embargo end
  val i_date = dates
    .filter(d => d.date.isDefined && d.dateType.isDefined)
    .find(d => d.dateType.get.equalsIgnoreCase("issued"))
    .map(d => extract_date(d.date.get))
  val a_date: Option[String] = dates
    .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
    .map(d => extract_date(d.date.get))
    .find(d => d != null && d.isDefined)
    .map(d => d.get)

  if (a_date.isDefined) {
    result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
  }
  if (i_date.isDefined && i_date.get.isDefined) {
    result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
    result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
  } else if (publication_year != null) {
    // fall back to January 1st of the publication year
    result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
    result
      .getInstance()
      .get(0)
      .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
  }

  // all parseable dates with a vocabulary-known dateType become relevant dates
  result.setRelevantdate(
    dates
      .filter(d => d.date.isDefined && d.dateType.isDefined)
      .map(d => (extract_date(d.date.get), d.dateType.get))
      .filter(d => d._1.isDefined)
      .map(d =>
        (
          d._1.get,
          vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
        )
      )
      .filter(d => d._2 != null)
      .map(d => generateOAFDate(d._1, d._2))
      .asJava
  )

  result.setCollectedfrom(List(collectedFromMap("ped")).asJava)

  val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]

  result.setDescription(
    descriptions
      .filter(d => d.description.isDefined)
      .map(d => OafMapperUtils.field(d.description.get, null))
      .filter(s => s != null)
      .asJava
  )

  val publisher = (json \\ "publisher").extractOrElse[String](null)
  if (publisher != null)
    result.setPublisher(OafMapperUtils.field(publisher, null))

  val language: String = (json \\ "language").extractOrElse[String](null)

  if (language != null)
    result.setLanguage(
      vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
    )

  val instance = result.getInstance().get(0)

  // collect all rightsUri values under rightsList
  val accessRights: List[String] = for {
    JObject(rightsList) <- json \\ "rightsList"
    JField("rightsUri", JString(rightsUri)) <- rightsList
  } yield rightsUri

  // first rightsUri that resolves in dnet:access_modes wins
  val aRights: Option[AccessRight] = accessRights
    .map(r => {
      vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
    })
    .find(q => q != null)
    .map(q => {
      val a = new AccessRight
      a.setClassid(q.getClassid)
      a.setClassname(q.getClassname)
      a.setSchemeid(q.getSchemeid)
      a.setSchemename(q.getSchemename)
      a
    })

  // default to UNKNOWN / not available when no rights resolved
  val access_rights_qualifier =
    if (aRights.isDefined) aRights.get
    else
      OafMapperUtils.accessRight(
        ModelConstants.UNKNOWN,
        ModelConstants.NOT_AVAILABLE,
        ModelConstants.DNET_ACCESS_MODES,
        ModelConstants.DNET_ACCESS_MODES
      )

  instance.setCollectedfrom(collectedFromMap("ped"))
  instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
  instance.setAccessright(access_rights_qualifier)
  instance.setPid(result.getPid)
  // a rightsUri that looks like a license URL is also recorded as the instance license
  val license = accessRights
    .find(r =>
      r.startsWith("http") && r.matches(
        ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
      )
    )
  if (license.isDefined)
    instance.setLicense(OafMapperUtils.field(license.get, null))

  val awardUris: List[String] = for {
    JObject(fundingReferences) <- json \\ "fundingReferences"
    JField("awardUri", JString(awardUri)) <- fundingReferences
  } yield awardUri

  // recompute the definitive OpenAIRE id from the pids now attached to the result
  result.setId(IdentifierFactory.createIdentifier(result))
  var relations: List[Relation] =
    awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)

  if (result.getId == null)
    return List()

  if (exportLinks) {
    val rels: List[RelatedIdentifierType] = for {
      JObject(relIdentifier) <- json \\ "relatedIdentifiers"
      JField("relationType", JString(relationType)) <- relIdentifier
      JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
      JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
    } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)

    relations = relations ::: generateRelations(
      rels,
      result.getId,
      if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
    )
  }
  if (relations != null && relations.nonEmpty) {
    List(result) ::: relations
  } else
    List(result)
}
|
|
||||||
|
|
||||||
/** Maps related-identifier entries onto Relations, keeping only relation types known
  * to subRelTypeMapping and identifier types doi / pmid / arxiv. The target is an
  * unresolved identifier to be linked later in the pipeline.
  *
  * Fixes: removed a second, redundant setCollectedfrom call with the same value,
  * and a dead `rel.getCollectedfrom.asScala.map(...).toList` expression whose
  * result was discarded.
  *
  * @param rels related-identifier entries from the record
  * @param id   OpenAIRE id of the source result
  * @param date relation date (may be null), stored as a property
  */
private def generateRelations(
  rels: List[RelatedIdentifierType],
  id: String,
  date: String
): List[Relation] = {
  rels
    .filter(r =>
      subRelTypeMapping
        .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
        r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
        r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
    )
    .map(r => {
      val rel = new Relation
      rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
      rel.setDataInfo(dataInfo)

      val subRelType = subRelTypeMapping(r.relationType).relType
      rel.setRelType(REL_TYPE_VALUE)
      rel.setSubRelType(subRelType)
      rel.setRelClass(r.relationType)

      val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)

      rel.setProperties(List(dateProps).asJava)

      rel.setSource(id)
      rel.setTarget(
        DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
      )
      rel
    })
}
|
|
||||||
|
|
||||||
/** Builds a datasource id ("10|<namespace>::<md5 of suffix>") from a
  * "<namespace>::<suffix>" input string.
  */
def generateDSId(input: String): String = {
  val namespace = StringUtils.substringBefore(input, "::")
  val localPart = StringUtils.substringAfter(input, "::")
  s"10|$namespace::${DHPUtils.md5(localPart)}"
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,41 +0,0 @@
|
||||||
{
|
|
||||||
"id": "PED00001#P38634_A_1",
|
|
||||||
"types": {
|
|
||||||
"resourceType": "Protein",
|
|
||||||
"resourceTypeGeneral": "Dataset"
|
|
||||||
},
|
|
||||||
"creators": [],
|
|
||||||
"identifiers": [
|
|
||||||
{
|
|
||||||
"identifier": "https://proteinensemble.org/PED00001#P38634_A_1",
|
|
||||||
"identifierType": "URL"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"relatedIdentifiers": [
|
|
||||||
{
|
|
||||||
"relationType": "CitedBy",
|
|
||||||
"relatedIdentifier": "https://identifiers.org/pubmed:20399186"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"relationType": "IsIdenticalTo",
|
|
||||||
"relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"alternateIdentifiers": [
|
|
||||||
{
|
|
||||||
"alternateIdentifier": "https://identifiers.org/uniprot:P38634"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"descriptions": [],
|
|
||||||
"titles": [
|
|
||||||
{
|
|
||||||
"title": "Protein SIC1"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"dates": [
|
|
||||||
{
|
|
||||||
"date": "2021-12-09T21:10:30",
|
|
||||||
"dateType": "Collected"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,108 +0,0 @@
|
||||||
package eu.dnetlib.dhp.bioschema
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
|
||||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
|
||||||
//import eu.dnetlib.dhp.bioschema.{BioschemaToOAFTransformation, GenerateDataciteDatasetSpark}
|
|
||||||
import eu.dnetlib.dhp.bioschema.BioschemaToOAFTransformation
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
|
||||||
import org.apache.commons.io.FileUtils
|
|
||||||
import org.apache.spark.SparkConf
|
|
||||||
import org.apache.spark.sql.functions.{col, count}
|
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
|
||||||
import org.junit.jupiter.api.Assertions._
|
|
||||||
import org.junit.jupiter.api.extension.ExtendWith
|
|
||||||
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
|
|
||||||
import org.mockito.junit.jupiter.MockitoExtension
|
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
|
||||||
|
|
||||||
import java.nio.file.{Files, Path}
|
|
||||||
import java.text.SimpleDateFormat
|
|
||||||
import java.util.Locale
|
|
||||||
import scala.io.Source
|
|
||||||
|
|
||||||
/** Unit tests for the Bioschema -> OAF transformation.
  *
  * Extends [[AbstractVocabularyTest]] so that the shared vocabularies are set up
  * before each test. A fresh temporary working directory is created per test and
  * removed afterwards.
  */
@ExtendWith(Array(classOf[MockitoExtension]))
class BioschemaDataciteToOAFTest extends AbstractVocabularyTest {

  // Per-test scratch directory; created in setUp(), deleted in tearDown().
  private var workingDir: Path = null
  val log: Logger = LoggerFactory.getLogger(getClass)

  @BeforeEach
  def setUp(): Unit = {
    workingDir = Files.createTempDirectory(getClass.getSimpleName)
    super.setUpVocabulary()
  }

  @AfterEach
  def tearDown(): Unit = {
    FileUtils.deleteDirectory(workingDir.toFile)
  }

  /** Verifies that an ISO-8601 timestamp with a numeric zone offset ("+0000")
    * parses to the expected epoch instant instead of merely printing it.
    */
  @Test
  def testDateMapping(): Unit = { // "()" added: side-effecting/asserting method
    val inputDate = "2021-07-14T11:52:54+0000"
    val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
    val dt = ISO8601FORMAT.parse(inputDate)
    // 2021-07-14T11:52:54Z expressed in epoch milliseconds
    assertEquals(1626263574000L, dt.getTime)
  }

  // @Test
  // def testConvert(): Unit = {
  //
  //   val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
  //
  //   val conf = new SparkConf()
  //   val spark: SparkSession = SparkSession
  //     .builder()
  //     .config(conf)
  //     .appName(getClass.getSimpleName)
  //     .master("local[*]")
  //     .getOrCreate()
  //
  //   implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
  //   val instance = new GenerateDataciteDatasetSpark(null, null, log)
  //   val targetPath = s"$workingDir/result"
  //
  //   instance.generateDataciteDataset(path, exportLinks = true, vocabularies, targetPath, spark)
  //
  //   import spark.implicits._
  //
  //   val nativeSize = spark.read.load(path).count()
  //
  //   assertEquals(100, nativeSize)
  //
  //   val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
  //
  //   result
  //     .map(s => s.getClass.getSimpleName)
  //     .groupBy(col("value").alias("class"))
  //     .agg(count("value").alias("Total"))
  //     .show(false)
  //
  //   val t = spark.read.load(targetPath).count()
  //
  //   assertTrue(t > 0)
  //
  //   spark.stop()
  //
  // }

  /** Maps the bundled PED record and checks the transformation yields at least
    * one OAF entity; the mapped entities are also pretty-printed for inspection.
    */
  @Test
  def testMapping(): Unit = {
    // Close the classpath stream explicitly: Source.fromInputStream does not own it.
    val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/bioschema/ped_record.json")
    val record =
      try Source.fromInputStream(stream).mkString
      finally stream.close()

    val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
    val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)

    // A valid record must produce at least one mapped entity.
    assertTrue(res.nonEmpty)

    res.foreach(r => {
      println(mapper.writeValueAsString(r))
      println("----------------------------")
    })
  }

}
|
|
|
@ -1,62 +0,0 @@
|
||||||
https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612
|
|
||||||
|
|
||||||
PED
|
|
||||||
<property>
|
|
||||||
<name>workingPath</name>
|
|
||||||
<value>/data/bioschema/ped/</value>
|
|
||||||
<description>the working path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sitemapUrl</name>
|
|
||||||
<value>https://proteinensemble.org/sitemap2.xml.gz</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sitemapURLKey</name>
|
|
||||||
<value>loc</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>dynamic</name>
|
|
||||||
<value>true</value>
|
|
||||||
<description>the dynamic flag determines whether the scraper uses Selenium (for dynamically rendered pages) or JSOUP (for static pages)</description>
|
|
||||||
</property>
|
|
||||||
|
|
||||||
DISPROT
|
|
||||||
<property>
|
|
||||||
<name>workingPath</name>
|
|
||||||
<value>/data/bioschema/disprot/</value>
|
|
||||||
<description>the working path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sitemapUrl</name>
|
|
||||||
<value>https://disprot.org/sitemap2.xml.gz</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sitemapURLKey</name>
|
|
||||||
<value>loc</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>dynamic</name>
|
|
||||||
<value>true</value>
|
|
||||||
<description>the dynamic flag determines whether the scraper uses Selenium (for dynamically rendered pages) or JSOUP (for static pages)</description>
|
|
||||||
</property>
|
|
||||||
|
|
||||||
MOBIDB
|
|
||||||
<property>
|
|
||||||
<name>workingPath</name>
|
|
||||||
<value>/data/bioschema/mobidb/</value>
|
|
||||||
<description>the working path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sitemapUrl</name>
|
|
||||||
<value>https://mobidb.org/sitemap2.xml.gz</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sitemapURLKey</name>
|
|
||||||
<value>loc</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>dynamic</name>
|
|
||||||
<value>true</value>
|
|
||||||
<description>the dynamic flag determines whether the scraper uses Selenium (for dynamically rendered pages) or JSOUP (for static pages)</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
Loading…
Reference in New Issue