/*
 * Provenance (taken from the patch this source was extracted from):
 *   commit  04b12a35cd514d60e47d8e89814e1016ae62f966
 *   author  Sandro La Bruzzo, Thu, 9 Feb 2023 16:11:59 +0100
 *   subject Moved crossref Utility to dhp-aggregation
 *
 * The patch creates two files under
 *   dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/crossref/
 *     - CrossrefUtility.scala
 *     - GenerateCrossrefDataset.scala
 * Both live in the same package and are reproduced below.
 */
package eu.dnetlib.dhp.crossref

import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._
import org.apache.commons.lang.StringUtils
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JNothing, JObject, JString, JValue}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

import java.util.Locale
import scala.collection.JavaConverters._

// ---------------------------------------------------------------------------
// CrossrefUtility.scala
// ---------------------------------------------------------------------------

/** A Crossref record: its DOI, the raw JSON payload and a timestamp. */
case class CrossrefDT(doi: String, json: String, timestamp: Long)

/** Helpers that map a Crossref JSON record onto the OAF model.
  *
  * NOTE(review): several names used here (`mappingFunder`, `mappingAuthor`,
  * `mappingFunderToRelations`, `convertPublication`, `convertDataset`,
  * `generateAuhtor`, `decideAccessRight`, `generateIdentifier`, `createSubject`)
  * are not defined in this file; they are assumed to be provided elsewhere in
  * the package — confirm before merging.
  */
object CrossrefUtility {

  /** Matches the start of a DOI, either at the beginning of the string or right after a `/`.
    * Both dots are escaped so that only a literal `10.` is accepted.
    */
  val DOI_PREFIX_REGEX: String = "(^10\\.|\\/10\\.)"
  val DOI_PREFIX: String = "10."

  val CROSSREF_COLLECTED_FROM: KeyValue =
    keyValue(ModelConstants.CROSSREF_ID, ModelConstants.CROSSREF_NAME)

  // Shared json4s formats for every extraction performed in this object.
  private implicit val formats: DefaultFormats.type = DefaultFormats

  /** Normalizes a DOI: strips all whitespace, lower-cases it and drops anything that
    * precedes the `10.` prefix (for example a resolver URL).
    *
    * @param input the raw DOI, possibly null
    * @return the normalized DOI starting with `10.`, or null when `input` is null or
    *         contains no `10.` prefix
    */
  def normalizeDoi(input: String): String =
    Option(input)
      .map(
        _.replaceAll("\\s", "")
          // Locale.ROOT keeps the result independent of the JVM default locale.
          .toLowerCase(Locale.ROOT)
          .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
      )
      .flatMap { cleaned =>
        val start = cleaned.indexOf(DOI_PREFIX)
        if (start < 0) None else Some(cleaned.substring(start))
      }
      .orNull

  /** Produces a cleaned date string from a Crossref date object.
    *
    * @param dt       the `date-time` value; when not blank it takes precedence
    * @param datePart the `date-parts` value; only used when it holds exactly one entry made of
    *                 `[year]`, `[year, month]` or `[year, month, day]`
    * @return the value returned by `GraphCleaningFunctions.cleanDate`, or null when no date
    *         can be built from the arguments
    */
  def extractDate(dt: String, datePart: List[List[Int]]): String =
    if (StringUtils.isNotBlank(dt))
      GraphCleaningFunctions.cleanDate(dt)
    else if (datePart != null && datePart.size == 1)
      datePart.head match {
        case List(year, month, day) =>
          val candidate = f"$year-$month%02d-$day%02d"
          // Anything that is not exactly yyyy-MM-dd (e.g. a year without four digits) is rejected.
          if (candidate.length == 10) GraphCleaningFunctions.cleanDate(candidate) else null
        case List(year, month) =>
          GraphCleaningFunctions.cleanDate(f"$year-$month%02d-01")
        case List(year) =>
          GraphCleaningFunctions.cleanDate(s"$year-01-01")
        case _ => null
      }
    else
      null

  /** Builds a date [[StructuredProperty]] qualified by `classId`/`schemeId`,
    * or returns null when [[extractDate]] yields no usable date.
    */
  private def generateDate(
    dt: String,
    datePart: List[List[Int]],
    classId: String,
    schemeId: String
  ): StructuredProperty = {
    val dp = extractDate(dt, datePart)
    if (StringUtils.isNotBlank(dp))
      structuredProperty(dp, classId, classId, schemeId)
    else
      null
  }

  /** Instantiates the result subclass that corresponds to a result typology name. */
  private def newResultFor(resourceType: String): Option[Result] = resourceType match {
    case "publication"          => Some(new Publication)
    case "dataset"              => Some(new Dataset)
    case "software"             => Some(new Software)
    case "otherresearchproduct" => Some(new OtherResearchProduct)
    case _                      => None
  }

  /** Resolves a Crossref `type` through the vocabularies.
    *
    * @return the empty result instance, the `DNET_PUBLICATION_RESOURCE` term the type maps to
    *         and the result typology name; `None` when any lookup fails or the typology is
    *         not one of the four supported ones
    */
  private def generateItemFromType(
    objectType: String,
    vocabularies: VocabularyGroup
  ): Option[(Result, Qualifier, String)] =
    for {
      term <- Option(
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, objectType)
      )
      typology <- Option(
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, term.getClassid)
      )
      resourceType <- Option(typology.getClassname)
      item         <- newResultFor(resourceType)
    } yield (item, term, resourceType)

  /** Converts a Crossref JSON record into OAF entities.
    *
    * @param input        the Crossref record as a JSON string
    * @param vocabularies vocabularies used to resolve the Crossref `type`
    * @return the funder relations (if any) followed by the mapped result; an empty list when
    *         the type is missing or unknown, or when no identifier could be produced
    */
  def convert(input: String, vocabularies: VocabularyGroup): List[Oaf] = {
    val json: json4s.JValue = parse(input)

    val mapped = for {
      objectType <- Option((json \ "type").extractOrElse[String](null))
      typed      <- generateItemFromType(objectType, vocabularies)
      result     <- Option(mappingResult(typed._1, json, typed._3, typed._2))
      if result.getId != null
    } yield (result, typed._3)

    mapped match {
      case None => List()
      case Some((result, cobjCategory)) =>
        val funderList: List[mappingFunder] =
          (json \ "funder").extractOrElse[List[mappingFunder]](List())

        val funderRelations: List[Oaf] =
          if (funderList.nonEmpty)
            mappingFunderToRelations(
              funderList,
              result.getId,
              CROSSREF_COLLECTED_FROM,
              result.getDataInfo,
              result.getLastupdatetimestamp
            )
          else
            List()

        result match {
          case publication: Publication => convertPublication(publication, json, cobjCategory)
          case dataset: Dataset         => convertDataset(dataset)
          case _                        => // Software / OtherResearchProduct: nothing type-specific to add
        }

        funderRelations ::: List(result)
    }
  }

  /** Fills `result` with the metadata found in a Crossref record.
    *
    * @param result       the (empty) result to populate; it is mutated in place
    * @param json         the parsed Crossref record
    * @param cobjCategory the result category; when `instanceType` is null it is read in the
    *                     legacy `"<4-char id> <name>"` layout (e.g. `"0001 Article"`)
    * @param instanceType the `DNET_PUBLICATION_RESOURCE` term to use as instance/resource type;
    *                     optional for backward compatibility
    * @return `result`, or null when the record has no usable DOI or no identifier was generated
    */
  def mappingResult(
    result: Result,
    json: JValue,
    cobjCategory: String,
    instanceType: Qualifier = null
  ): Result = {
    val doi: String = normalizeDoi((json \ "DOI").extractOrElse[String](null))
    if (doi == null)
      null // without a DOI neither the pid nor the identifier can be built
    else {
      mapIdentifiers(result, json, doi)
      mapProvenance(result, json)
      mapDescriptiveMetadata(result, json)
      val acceptanceDate = mapDates(result, json)
      mapSubjects(result, json)
      mapAuthors(result, json)
      result.setResourcetype(typeQualifier(cobjCategory, instanceType))
      val instance =
        buildInstance(result, json, doi, acceptanceDate, typeQualifier(cobjCategory, instanceType))
      result.setInstance(List(instance).asJava)
      assignIdentifier(result, doi)
    }
  }

  /** Sets the DOI as pid and collects DOI, clinical trial numbers and alternative ids as original ids. */
  private def mapIdentifiers(result: Result, json: JValue, doi: String): Unit = {
    result.setPid(
      List(
        structuredProperty(doi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES)
      ).asJava
    )

    val clinicalTrialNumbers: List[String] = for (JString(ctr) <- json \ "clinical-trial-number") yield ctr
    val alternativeIds: List[String] = for (JString(ids) <- json \ "alternative-id") yield ids
    val originalIds = (clinicalTrialNumbers ::: alternativeIds ::: List(doi)).filter(_ != null)

    // A real ArrayList is required: the wrapper returned by asJava is read-only and
    // assignIdentifier may append the legacy identifier to this list.
    result.setOriginalId(new java.util.ArrayList[String](originalIds.asJava))
  }

  /** Sets data info, indexing timestamps and the Crossref collected-from. */
  private def mapProvenance(result: Result, json: JValue): Unit = {
    result.setDataInfo(dataInfo(false, false, 0.9f, null, false, ModelConstants.REPOSITORY_PROVENANCE_ACTIONS))
    result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
    result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])
    result.setCollectedfrom(List(CROSSREF_COLLECTED_FROM).asJava)
  }

  /** Maps publisher, titles, abstract and source. */
  private def mapDescriptiveMetadata(result: Result, json: JValue): Unit = {
    val publisher = (json \ "publisher").extractOrElse[String](null)
    if (publisher != null && publisher.nonEmpty)
      result.setPublisher(new Publisher(publisher))

    val mainTitles = for {
      JString(title) <- json \ "title" if title.nonEmpty
    } yield structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER)
    val originalTitles = for {
      JString(title) <- json \ "original-title" if title.nonEmpty
    } yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
    val shortTitles = for {
      JString(title) <- json \ "short-title" if title.nonEmpty
    } yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
    val subtitles = for {
      JString(title) <- json \ "subtitle" if title.nonEmpty
    } yield structuredProperty(title, ModelConstants.SUBTITLE_QUALIFIER)
    result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)

    val descriptionList = for { JString(description) <- json \ "abstract" } yield description
    result.setDescription(descriptionList.asJava)

    val sourceList = for {
      JString(source) <- json \ "source" if source != null && source.nonEmpty
    } yield source
    result.setSource(sourceList.asJava)
  }

  /** Maps the relevant dates and the date of acceptance.
    *
    * @return the date of acceptance set on `result`: the issued date when available, otherwise
    *         the created date, otherwise null
    */
  private def mapDates(result: Result, json: JValue): String = {
    def relevantDate(field: String, classId: String): StructuredProperty =
      generateDate(
        (json \ field \ "date-time").extractOrElse[String](null),
        (json \ field \ "date-parts").extract[List[List[Int]]],
        classId,
        ModelConstants.DNET_DATACITE_DATE
      )

    val createdDate = relevantDate("created", "created")
    val postedDate = relevantDate("posted", "available")
    val acceptedDate = relevantDate("accepted", "accepted")
    val publishedPrintDate = relevantDate("published-print", "published-print")
    val publishedOnlineDate = relevantDate("published-online", "published-online")

    val issuedDate = extractDate(
      (json \ "issued" \ "date-time").extractOrElse[String](null),
      (json \ "issued" \ "date-parts").extract[List[List[Int]]]
    )

    val acceptanceDate =
      if (StringUtils.isNotBlank(issuedDate)) issuedDate
      else if (createdDate != null) createdDate.getValue
      else null
    result.setDateofacceptance(acceptanceDate)

    result.setRelevantdate(
      List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
        .filter(_ != null)
        .asJava
    )
    acceptanceDate
  }

  /** Maps every Crossref subject to a `keyword` subject. */
  private def mapSubjects(result: Result, json: JValue): Unit = {
    val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
    if (subjectList.nonEmpty)
      result.setSubject(
        subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
      )
  }

  /** Maps the authors, placing those whose sequence is `first` before the others. */
  private def mapAuthors(result: Result, json: JValue): Unit = {
    val authorList: List[mappingAuthor] =
      (json \ "author").extractOrElse[List[mappingAuthor]](List())

    // Stable sort on a boolean key (false < true): "first" authors move to the front and
    // the relative order inside each group is preserved.
    val firstAuthorsFirst =
      authorList.sortBy(a => !a.sequence.exists(_.equalsIgnoreCase("first")))

    result.setAuthor(firstAuthorsFirst.zipWithIndex.map { case (a, index) =>
      generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
    }.asJava)
  }

  /** Builds the instance/resource type qualifier in the `DNET_PUBLICATION_RESOURCE` scheme.
    * A fresh object is returned on every call so that result and instance do not share it.
    */
  private def typeQualifier(cobjCategory: String, instanceType: Qualifier): Qualifier = {
    val (classId, className) =
      if (instanceType != null) (instanceType.getClassid, instanceType.getClassname)
      // Legacy layout "<4-char id> <name>", kept for callers that pass no instanceType.
      else (cobjCategory.substring(0, 4), cobjCategory.substring(5))
    OafMapperUtils.qualifier(
      classId,
      className,
      ModelConstants.DNET_PUBLICATION_RESOURCE,
      ModelConstants.DNET_PUBLICATION_RESOURCE
    )
  }

  /** Builds the single instance attached to the result. */
  private def buildInstance(
    result: Result,
    json: JValue,
    doi: String,
    acceptanceDate: String,
    instanceType: Qualifier
  ): Instance = {
    val instance = new Instance()

    val licenses = for {
      JObject(license)                                    <- json \ "license"
      JField("URL", JString(lic))                         <- license
      JField("content-version", JString(content_version)) <- license
    } yield (asField(lic), content_version)
    val usableLicenses = licenses.filter(d => StringUtils.isNotBlank(d._1.getValue))
    // Prefer the (last) "vor" licence, otherwise fall back to the first usable one.
    usableLicenses
      .filter(_._2 == "vor")
      .lastOption
      .orElse(usableLicenses.headOption)
      .foreach(d => instance.setLicense(d._1))

    // Ticket #6281 added pid to Instance
    instance.setPid(result.getPid)

    if ((json \ "relation" \ "has-review" \ "id") != JNothing)
      instance.setRefereed(
        OafMapperUtils.qualifier(
          "0001",
          "peerReviewed",
          ModelConstants.DNET_REVIEW_LEVELS,
          ModelConstants.DNET_REVIEW_LEVELS
        )
      )

    // The locally computed date is passed instead of reading it back from the result.
    // NOTE(review): acceptanceDate may be null when the record has neither an issued nor a
    // created date — confirm decideAccessRight tolerates that.
    instance.setAccessright(decideAccessRight(instance.getLicense, acceptanceDate))
    instance.setInstancetype(instanceType)
    instance.setCollectedfrom(CROSSREF_COLLECTED_FROM)
    if (acceptanceDate != null)
      instance.setDateofacceptance(asField(acceptanceDate))

    // Only the DOI resolver URL is exposed; the Crossref "link" URLs were deliberately left
    // out (the original kept that mapping commented out).
    instance.setUrl(List("https://doi.org/" + doi).asJava)
    instance
  }

  /** Assigns the final identifier.
    *
    * The identifier is produced by `IdentifierFactory`; the legacy identifier computed by
    * `generateIdentifier` is preserved among the original ids when it differs.
    *
    * @return `result`, or null when no identifier could be generated
    */
  private def assignIdentifier(result: Result, doi: String): Result = {
    val oldId = generateIdentifier(result, doi)
    result.setId(oldId)

    val newId = IdentifierFactory.createDOIBoostIdentifier(result)
    if (!oldId.equalsIgnoreCase(newId))
      result.getOriginalId.add(oldId)
    result.setId(newId)

    if (result.getId == null) null else result
  }

}

// ---------------------------------------------------------------------------
// GenerateCrossrefDataset.scala
// ---------------------------------------------------------------------------

/** Spark application skeleton for generating the Crossref dataset.
  *
  * @param propertyPath classpath location of the JSON file describing the application parameters
  * @param args         command line arguments
  * @param log          logger handed to the base application
  */
class GenerateCrossrefDataset(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Here all the spark applications runs this method
    * where the whole logic of the spark node is defined
    *
    * NOTE(review): not implemented yet — calling it throws `NotImplementedError`.
    */
  override def run(): Unit = ???
}

object GenerateCrossrefDataset {
  val log: Logger = LoggerFactory.getLogger(getClass)

  // NOTE(review): this still points at a doiboost resource path; confirm the parameter file
  // is available on the dhp-aggregation classpath.
  val propertyPath = "/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"

  /** Entry point: builds the application, initializes it and runs it. */
  def main(args: Array[String]): Unit = {
    new GenerateCrossrefDataset(propertyPath, args, log).initialize().run()
  }
}