Moved Crossref Utility to dhp-aggregation

Sandro La Bruzzo 2023-02-09 16:11:59 +01:00
parent 606cada7a4
commit 04b12a35cd
2 changed files with 379 additions and 0 deletions

View File

@@ -0,0 +1,357 @@
package eu.dnetlib.dhp.crossref

import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._
import org.apache.commons.lang.StringUtils
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JNothing, JObject, JString, JValue}
import org.json4s.jackson.JsonMethods.parse

import scala.collection.JavaConverters._

case class CrossrefDT(doi: String, json: String, timestamp: Long) {}

object CrossrefUtility {

  val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
  val DOI_PREFIX = "10."

  val CROSSREF_COLLECTED_FROM = keyValue(ModelConstants.CROSSREF_ID, ModelConstants.CROSSREF_NAME)

  def normalizeDoi(input: String): String = {
    if (input == null)
      return null
    val replaced = input
      .replaceAll("(?:\\n|\\r|\\t|\\s)", "")
      .toLowerCase
      .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
    if (replaced == null || replaced.trim.isEmpty)
      return null
    if (replaced.indexOf("10.") < 0)
      return null
    val ret = replaced.substring(replaced.indexOf("10."))
    if (!ret.startsWith(DOI_PREFIX))
      return null
    ret
  }
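
  // Illustrative behaviour (editorial sketch, not part of the committed file):
  //   normalizeDoi("https://doi.org/10.1000/XYZ") is expected to yield "10.1000/xyz",
  //   while an input that contains no "10." prefix yields null.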

  def extractDate(dt: String, datePart: List[List[Int]]): String = {
    if (StringUtils.isNotBlank(dt))
      return GraphCleaningFunctions.cleanDate(dt)
    if (datePart != null && datePart.size == 1) {
      val res = datePart.head
      if (res.size == 3) {
        val dp = f"${res.head}-${res(1)}%02d-${res(2)}%02d"
        if (dp.length == 10) {
          return GraphCleaningFunctions.cleanDate(dp)
        }
      } else if (res.size == 2) {
        val dp = f"${res.head}-${res(1)}%02d-01"
        return GraphCleaningFunctions.cleanDate(dp)
      } else if (res.size == 1) {
        return GraphCleaningFunctions.cleanDate(s"${res.head}-01-01")
      }
    }
    null
  }
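
  // Illustrative behaviour (editorial sketch, not part of the committed file):
  //   with a blank dt and datePart == List(List(2023, 2, 9)) the method builds "2023-02-09"
  //   and hands it to GraphCleaningFunctions.cleanDate; a single-element part becomes "<year>-01-01".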

  private def generateDate(
    dt: String,
    datePart: List[List[Int]],
    classId: String,
    schemeId: String
  ): StructuredProperty = {
    val dp = extractDate(dt, datePart)
    if (StringUtils.isNotBlank(dp))
      structuredProperty(dp, classId, classId, schemeId)
    else
      null
  }

  private def generateItemFromType(objectType: String, vocabularies: VocabularyGroup): (Result, String) = {
    val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, objectType)
    if (term != null) {
      val resourceType =
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, term.getClassid).getClassname
      resourceType match {
        case "publication"          => (new Publication, resourceType)
        case "dataset"              => (new Dataset, resourceType)
        case "software"             => (new Software, resourceType)
        case "otherresearchproduct" => (new OtherResearchProduct, resourceType)
      }
    } else
      null
  }

  def convert(input: String, vocabularies: VocabularyGroup): List[Oaf] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)

    var resultList: List[Oaf] = List()

    val objectType = (json \ "type").extractOrElse[String](null)
    if (objectType == null)
      return resultList

    val resultWithType = generateItemFromType(objectType, vocabularies)
    if (resultWithType == null)
      return List()
    val result = resultWithType._1
    val cOBJCategory = resultWithType._2
    mappingResult(result, json, cOBJCategory)
    if (result == null || result.getId == null)
      return List()

    val funderList: List[mappingFunder] =
      (json \ "funder").extractOrElse[List[mappingFunder]](List())

    if (funderList.nonEmpty) {
      resultList = resultList ::: mappingFunderToRelations(
        funderList,
        result.getId,
        createCrossrefCollectedFrom(),
        result.getDataInfo,
        result.getLastupdatetimestamp
      )
    }

    result match {
      case publication: Publication => convertPublication(publication, json, cOBJCategory)
      case dataset: Dataset         => convertDataset(dataset)
    }

    resultList = resultList ::: List(result)
    resultList
  }

  def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

    //MAPPING Crossref DOI into PID
    val doi: String = normalizeDoi((json \ "DOI").extract[String])
    result.setPid(
      List(
        structuredProperty(doi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES)
      ).asJava
    )

    //MAPPING Crossref DOI into OriginalId
    //and Other Original Identifier of dataset like clinical-trial-number
    val clinicalTrialNumbers: List[String] = for (JString(ctr) <- json \ "clinical-trial-number") yield ctr
    val alternativeIds: List[String] = for (JString(ids) <- json \ "alternative-id") yield ids
    val tmp = clinicalTrialNumbers ::: alternativeIds ::: List(doi)
    result.setOriginalId(tmp.filter(id => id != null).asJava)

    // Add DataInfo
    result.setDataInfo(dataInfo(false, false, 0.9F, null, false, ModelConstants.REPOSITORY_PROVENANCE_ACTIONS))

    result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
    result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])

    result.setCollectedfrom(List(CROSSREF_COLLECTED_FROM).asJava)

    // Publisher (name of work's publisher mapped into Result/Publisher)
    val publisher = (json \ "publisher").extractOrElse[String](null)
    if (publisher != null && publisher.nonEmpty)
      result.setPublisher(new Publisher(publisher))

    // TITLE
    val mainTitles =
      for { JString(title) <- json \ "title" if title.nonEmpty } yield structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER)
    val originalTitles = for {
      JString(title) <- json \ "original-title" if title.nonEmpty
    } yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
    val shortTitles = for {
      JString(title) <- json \ "short-title" if title.nonEmpty
    } yield structuredProperty(title, ModelConstants.ALTERNATIVE_TITLE_QUALIFIER)
    val subtitles =
      for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield structuredProperty(title, ModelConstants.SUBTITLE_QUALIFIER)
    result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)

    // DESCRIPTION
    val descriptionList =
      for { JString(description) <- json \ "abstract" } yield description
    result.setDescription(descriptionList.asJava)

    // Source
    val sourceList = for {
      JString(source) <- json \ "source" if source != null && source.nonEmpty
    } yield source
    result.setSource(sourceList.asJava)

    //RELEVANT DATE Mapping
    val createdDate = generateDate(
      (json \ "created" \ "date-time").extract[String],
      (json \ "created" \ "date-parts").extract[List[List[Int]]],
      "created",
      ModelConstants.DNET_DATACITE_DATE
    )
    val postedDate = generateDate(
      (json \ "posted" \ "date-time").extractOrElse[String](null),
      (json \ "posted" \ "date-parts").extract[List[List[Int]]],
      "available",
      ModelConstants.DNET_DATACITE_DATE
    )
    val acceptedDate = generateDate(
      (json \ "accepted" \ "date-time").extractOrElse[String](null),
      (json \ "accepted" \ "date-parts").extract[List[List[Int]]],
      "accepted",
      ModelConstants.DNET_DATACITE_DATE
    )
    val publishedPrintDate = generateDate(
      (json \ "published-print" \ "date-time").extractOrElse[String](null),
      (json \ "published-print" \ "date-parts").extract[List[List[Int]]],
      "published-print",
      ModelConstants.DNET_DATACITE_DATE
    )
    val publishedOnlineDate = generateDate(
      (json \ "published-online" \ "date-time").extractOrElse[String](null),
      (json \ "published-online" \ "date-parts").extract[List[List[Int]]],
      "published-online",
      ModelConstants.DNET_DATACITE_DATE
    )

    val issuedDate = extractDate(
      (json \ "issued" \ "date-time").extractOrElse[String](null),
      (json \ "issued" \ "date-parts").extract[List[List[Int]]]
    )
    if (StringUtils.isNotBlank(issuedDate)) {
      result.setDateofacceptance(issuedDate)
    } else {
      result.setDateofacceptance(createdDate.getValue)
    }
    result.setRelevantdate(
      List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
        .filter(p => p != null)
        .asJava
    )

    //Mapping Subject
    val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
    if (subjectList.nonEmpty) {
      result.setSubject(
        subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
      )
    }

    //Mapping Author
    val authorList: List[mappingAuthor] =
      (json \ "author").extractOrElse[List[mappingAuthor]](List())

    val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) =>
      a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")
    )

    result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
      generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
    }.asJava)

    // Mapping instance
    val instance = new Instance()
    val license = for {
      JObject(license) <- json \ "license"
      JField("URL", JString(lic)) <- license
      JField("content-version", JString(content_version)) <- license
    } yield (asField(lic), content_version)
    val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue))
    if (l.nonEmpty) {
      // prefer the "vor" (version of record) license when one is present
      if (l exists (d => d._2.equals("vor"))) {
        for (d <- l) {
          if (d._2.equals("vor")) {
            instance.setLicense(d._1)
          }
        }
      } else {
        instance.setLicense(l.head._1)
      }
    }

    // Ticket #6281 added pid to Instance
    instance.setPid(result.getPid)

    val has_review = json \ "relation" \ "has-review" \ "id"
    if (has_review != JNothing) {
      instance.setRefereed(
        OafMapperUtils.qualifier(
          "0001",
          "peerReviewed",
          ModelConstants.DNET_REVIEW_LEVELS,
          ModelConstants.DNET_REVIEW_LEVELS
        )
      )
    }

    instance.setAccessright(
      decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
    )
    instance.setInstancetype(
      OafMapperUtils.qualifier(
        cobjCategory.substring(0, 4),
        cobjCategory.substring(5),
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        ModelConstants.DNET_PUBLICATION_RESOURCE
      )
    )
    result.setResourcetype(
      OafMapperUtils.qualifier(
        cobjCategory.substring(0, 4),
        cobjCategory.substring(5),
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        ModelConstants.DNET_PUBLICATION_RESOURCE
      )
    )

    instance.setCollectedfrom(createCrossrefCollectedFrom())
    if (StringUtils.isNotBlank(issuedDate)) {
      instance.setDateofacceptance(asField(issuedDate))
    } else {
      instance.setDateofacceptance(asField(createdDate.getValue))
    }

    val s: List[String] = List("https://doi.org/" + doi)
//    val links: List[String] = ((for { JString(url) <- json \ "link" \ "URL" } yield url) ::: List(s))
//      .filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase()))
//      .distinct
//    if (links.nonEmpty) {
//      instance.setUrl(links.asJava)
//    }
    if (s.nonEmpty) {
      instance.setUrl(s.asJava)
    }
    result.setInstance(List(instance).asJava)

    //IMPORTANT
    //The old method result.setId(generateIdentifier(result, doi))
    //is replaced using IdentifierFactory, but the old identifier
    //is preserved among the originalId(s)
    val oldId = generateIdentifier(result, doi)
    result.setId(oldId)

    val newId = IdentifierFactory.createDOIBoostIdentifier(result)
    if (!oldId.equalsIgnoreCase(newId)) {
      result.getOriginalId.add(oldId)
    }
    result.setId(newId)

    if (result.getId == null)
      null
    else
      result
  }
}

View File

@@ -0,0 +1,22 @@
package eu.dnetlib.dhp.crossref

import eu.dnetlib.dhp.application.AbstractScalaApplication
import org.slf4j.{Logger, LoggerFactory}

class GenerateCrossrefDataset(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** All Spark applications run this method,
    * where the whole logic of the Spark node is defined.
    */
  override def run(): Unit = ???
}

object GenerateCrossrefDataset {

  val log: Logger = LoggerFactory.getLogger(getClass)
  val propertyPath = "/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"

  def main(args: Array[String]): Unit = {
    new GenerateCrossrefDataset(propertyPath, args, log).initialize().run()
  }
}
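
run() is left as ??? in this commit. Purely as an illustration of how CrossrefUtility.convert could be wired into a Spark node, here is a minimal sketch; the helper object name, the sourcePath/targetPath parameters and the externally supplied VocabularyGroup are assumptions, not taken from the commit itself.

// Hypothetical sketch only, not the implementation of this commit.
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.crossref.{CrossrefDT, CrossrefUtility}
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

object CrossrefConversionSketch {

  // Reads CrossrefDT records, converts each JSON payload to OAF entities and writes the result.
  def convertAll(spark: SparkSession, sourcePath: String, targetPath: String, vocabularies: VocabularyGroup): Unit = {
    import spark.implicits._
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]

    spark.read
      .load(sourcePath) // assumed to be a parquet dataset of CrossrefDT(doi, json, timestamp)
      .as[CrossrefDT]
      .flatMap(dt => CrossrefUtility.convert(dt.json, vocabularies))
      .write
      .mode("overwrite")
      .save(targetPath)
  }
}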