2021-11-25 10:54:13 +01:00
|
|
|
package eu.dnetlib.dhp.datacite
|
|
|
|
|
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
2023-02-01 16:24:35 +01:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.{DataInfo, EntityDataInfo, KeyValue}
|
2021-11-25 10:54:13 +01:00
|
|
|
|
|
|
|
import java.io.InputStream
|
|
|
|
import java.time.format.DateTimeFormatter
|
|
|
|
import java.util.Locale
|
|
|
|
import java.util.regex.Pattern
|
|
|
|
import scala.io.Source
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Data model of a single record of the input Datacite dataset.
  *
  * @param doi       the DOI of the record
  * @param timestamp timestamp of the last update of the record
  * @param isActive  whether the record is active (as opposed to deleted)
  * @param json      the native JSON of the record
  */
case class DataciteType(
  doi: String,
  timestamp: Long,
  isActive: Boolean,
  json: String
)
|
|
|
|
|
|
|
|
/*
|
|
|
|
The following classes are utility classes used for the mapping from
Datacite JSON to the OAF schema
|
|
|
|
*/
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Helper model holding one related-identifier entry, made of three mandatory strings.
  *
  * NOTE(review): the fields presumably mirror the same-named attributes of a
  * Datacite `relatedIdentifiers` element; confirm against the JSON parsing code.
  *
  * @param relationType          name of the relation
  * @param relatedIdentifier     value of the related identifier
  * @param relatedIdentifierType type of the related identifier
  */
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String)
|
|
|
|
|
|
|
|
/** Helper model holding one name-identifier entry; every field is optional.
  *
  * NOTE(review): the fields presumably mirror the same-named attributes of a
  * Datacite `nameIdentifiers` element; confirm against the JSON parsing code.
  *
  * @param nameIdentifierScheme optional scheme of the identifier
  * @param schemeUri            optional URI of the scheme
  * @param nameIdentifier       optional identifier value
  */
case class NameIdentifiersType(
  nameIdentifierScheme: Option[String],
  schemeUri: Option[String],
  nameIdentifier: Option[String]
)
|
|
|
|
|
|
|
|
/** Helper model holding one creator entry; every field is optional.
  *
  * NOTE(review): the fields presumably mirror the same-named attributes of a
  * Datacite `creators` element; confirm against the JSON parsing code.
  *
  * @param nameType        optional type of the name
  * @param nameIdentifiers optional list of [[NameIdentifiersType]] attached to the creator
  * @param name            optional full name
  * @param familyName      optional family name
  * @param givenName       optional given name
  * @param affiliation     optional list of affiliation strings
  */
case class CreatorType(
  nameType: Option[String],
  nameIdentifiers: Option[List[NameIdentifiersType]],
  name: Option[String],
  familyName: Option[String],
  givenName: Option[String],
  affiliation: Option[List[String]]
)
|
2021-11-25 10:54:13 +01:00
|
|
|
|
|
|
|
/** Helper model holding one title entry; every field is optional.
  *
  * @param title     optional title text
  * @param titleType optional type of the title
  * @param lang      optional language of the title
  */
case class TitleType(
  title: Option[String],
  titleType: Option[String],
  lang: Option[String]
)
|
|
|
|
|
|
|
|
/** Helper model holding one subject entry; every field is optional.
  *
  * @param subject       optional subject text
  * @param subjectScheme optional scheme the subject belongs to
  */
case class SubjectType(
  subject: Option[String],
  subjectScheme: Option[String]
)
|
|
|
|
|
|
|
|
/** Helper model holding one description entry; every field is optional.
  *
  * @param descriptionType optional type of the description
  * @param description     optional description text
  */
case class DescriptionType(
  descriptionType: Option[String],
  description: Option[String]
)
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Helper model holding one funding-reference entry; every field is optional.
  *
  * NOTE(review): the fields presumably mirror the same-named attributes of a
  * Datacite `fundingReferences` element; confirm against the JSON parsing code.
  *
  * @param funderIdentifierType optional type of the funder identifier
  * @param awardTitle           optional title of the award
  * @param awardUri             optional URI of the award
  * @param funderName           optional name of the funder
  * @param funderIdentifier     optional identifier of the funder
  * @param awardNumber          optional number of the award
  */
case class FundingReferenceType(
  funderIdentifierType: Option[String],
  awardTitle: Option[String],
  awardUri: Option[String],
  funderName: Option[String],
  funderIdentifier: Option[String],
  awardNumber: Option[String]
)
|
2021-11-25 10:54:13 +01:00
|
|
|
|
|
|
|
/** Helper model holding one date entry; every field is optional.
  *
  * @param date     optional date value, kept as the raw string
  * @param dateType optional type of the date
  */
case class DateType(
  date: Option[String],
  dateType: Option[String]
)
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Description of an OAF relation together with its inverse.
  *
  * @param relation name of the relation
  * @param inverse  name of the relation to use in the opposite direction
  * @param relType  classification value associated with the relation
  */
case class OAFRelations(
  relation: String,
  inverse: String,
  relType: String
)
|
2021-11-25 10:54:13 +01:00
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Empty, serializable class with no members of its own. */
class DataciteModelConstants extends Serializable
|
2021-11-25 10:54:13 +01:00
|
|
|
|
|
|
|
/** Constants, lookup tables and pre-compiled patterns shared by the Datacite mapping code. */
object DataciteModelConstants {

  val REL_TYPE_VALUE: String = "resultResult"
  val DATE_RELATION_KEY: String = "RelationDate"

  /** Classpath location of the resource loaded into [[datacite_filter]]. */
  val DATACITE_FILTER_PATH: String = "/eu/dnetlib/dhp/datacite/datacite_filter"
  val DOI_CLASS: String = "doi"
  val SUBJ_CLASS: String = "keywords"
  val DATACITE_NAME: String = "Datacite"

  /** Default data-info instance, built by [[dataciteDataInfo]] with a trust of `0.9f`. */
  val dataInfo: EntityDataInfo = dataciteDataInfo(0.9f)

  /** Key/value pair built from `ModelConstants.DATACITE_ID` and [[DATACITE_NAME]]. */
  val DATACITE_COLLECTED_FROM: KeyValue =
    OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)

  /** Builds one entry of [[subRelTypeMapping]]; the key is always the relation name itself,
    * so the key and the `relation` field cannot drift apart.
    */
  private def relationEntry(relation: String, inverse: String, relType: String): (String, OAFRelations) =
    relation -> OAFRelations(relation, inverse, relType)

  /** Lookup table from a relation name to its [[OAFRelations]] description
    * (relation, inverse relation and associated classification value).
    *
    * Each relation name appears exactly once. Some entries are intentionally not symmetric
    * (e.g. `IS_VARIANT_FORM_OF` and `IS_OBSOLETED_BY` have no entry keyed by their own inverse pair).
    */
  val subRelTypeMapping: Map[String, OAFRelations] = Map(
    relationEntry(
      ModelConstants.REFERENCES,
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.RELATIONSHIP
    ),
    relationEntry(
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.REFERENCES,
      ModelConstants.RELATIONSHIP
    ),
    relationEntry(
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.SUPPLEMENT
    ),
    relationEntry(
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.SUPPLEMENT
    ),
    relationEntry(
      ModelConstants.HAS_PART,
      ModelConstants.IS_PART_OF,
      ModelConstants.PART
    ),
    relationEntry(
      ModelConstants.IS_PART_OF,
      ModelConstants.HAS_PART,
      ModelConstants.PART
    ),
    relationEntry(
      ModelConstants.IS_VERSION_OF,
      ModelConstants.HAS_VERSION,
      ModelConstants.VERSION
    ),
    relationEntry(
      ModelConstants.HAS_VERSION,
      ModelConstants.IS_VERSION_OF,
      ModelConstants.VERSION
    ),
    relationEntry(
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.RELATIONSHIP
    ),
    relationEntry(
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.CONTINUES,
      ModelConstants.RELATIONSHIP
    ),
    relationEntry(
      ModelConstants.CONTINUES,
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.RELATIONSHIP
    ),
    relationEntry(
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.VERSION
    ),
    relationEntry(
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    relationEntry(
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.DOCUMENTS,
      ModelConstants.RELATIONSHIP
    ),
    relationEntry(
      ModelConstants.DOCUMENTS,
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.RELATIONSHIP
    ),
    relationEntry(
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    relationEntry(
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.VERSION
    ),
    relationEntry(
      ModelConstants.CITES,
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITATION
    ),
    relationEntry(
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITES,
      ModelConstants.CITATION
    ),
    relationEntry(
      ModelConstants.IS_VARIANT_FORM_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    relationEntry(
      ModelConstants.IS_OBSOLETED_BY,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    relationEntry(
      ModelConstants.REVIEWS,
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEW
    ),
    relationEntry(
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEWS,
      ModelConstants.REVIEW
    ),
    relationEntry(
      ModelConstants.COMPILES,
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.RELATIONSHIP
    ),
    relationEntry(
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.COMPILES,
      ModelConstants.RELATIONSHIP
    )
  )

  /** Lines of the classpath resource at [[DATACITE_FILTER_PATH]], read once when this object
    * is initialised.
    *
    * Throws `IllegalArgumentException` if the resource cannot be found.
    */
  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
    require(stream != null, s"Datacite filter resource not found on classpath: $DATACITE_FILTER_PATH")
    val source = Source.fromInputStream(stream)
    // Materialise all lines before closing; closing the Source also closes the underlying stream.
    try source.getLines().toList
    finally source.close()
  }

  /** Builds an [[EntityDataInfo]] through `OafMapperUtils.dataInfo` using the given trust value
    * and `ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER`.
    *
    * NOTE(review): the meaning of the remaining positional arguments (`false`, `false`, `null`,
    * `false`) is defined by `OafMapperUtils.dataInfo`, which is not visible here.
    *
    * @param trust trust value forwarded to `OafMapperUtils.dataInfo`
    * @return the data-info instance produced by `OafMapperUtils.dataInfo`
    */
  def dataciteDataInfo(trust: Float): EntityDataInfo = OafMapperUtils.dataInfo(
    false,
    false,
    trust,
    null,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER
  )

  /** Date formatter with the English locale; each bracketed section is an optional alternative
    * layout.
    *
    * NOTE(review): `[dd-MM-yy]` is listed twice in the pattern; it is kept as is to avoid any
    * change in parsing behaviour.
    */
  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
    "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
    Locale.ENGLISH
  )

  /** Date formatter with the Italian locale accepting `dd-MM-yyyy` or `dd/MM/yyyy`. */
  val df_it: DateTimeFormatter =
    DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  /** Pairs of (case-insensitive pattern matching a grant-agreement URI followed by a six-digit
    * code, identifier prefix associated with that pattern).
    */
  val funder_regex: List[(Pattern, String)] = List(
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda__h2020::"
    ),
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda_______::"
    )
  )

  /** Patterns used to recognise dates, listed in this order: year-month-day, month-day-year,
    * day-month-year, and year only.
    */
  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile(
      "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
      Pattern.MULTILINE
    ),
    //M-D-Y
    Pattern.compile(
      "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
      Pattern.MULTILINE
    ),
    //D-M-Y
    // The month class for days 29/30 is [13-9] (every month digit except 2); it previously read
    // [1,3-9], which also accepted a literal comma as a month.
    Pattern.compile(
      "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[13-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
      Pattern.MULTILINE
    ),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )

}
|