From cf758f4f91654745d08bc5dce7cd47ecd1334260 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 10:03:15 +0200 Subject: [PATCH] added normalization step for the doi --- .../java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 6 ++++-- .../java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala | 3 ++- .../dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 3f6a26c46..15a321431 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -16,9 +16,10 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex import eu.dnetlib.dhp.schema.scholexplorer.OafUtils - import java.util +import eu.dnetlib.doiboost.DoiBoostMappingUtil + case class CrossrefDT(doi: String, json:String, timestamp: Long) {} case class mappingAffiliation(name: String) {} @@ -89,7 +90,7 @@ case object Crossref2Oaf { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats //MAPPING Crossref DOI into PID - val doi: String = (json \ "DOI").extract[String] + val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) result.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava) //MAPPING Crossref DOI into OriginalId @@ -101,6 +102,7 @@ case object Crossref2Oaf { val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava) result.setOriginalId(originalIds) + // Add DataInfo result.setDataInfo(generateDataInfo()) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala index 4a39a2987..159b817c7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala @@ -1,6 +1,7 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.apache.commons.io.IOUtils import org.apache.hadoop.io.{IntWritable, Text} import org.apache.spark.SparkConf @@ -21,7 +22,7 @@ object CrossrefDataset { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) val ts:Long = (json \ "indexed" \ "timestamp").extract[Long] - val doi:String = (json \ "DOI").extract[String] + val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) CrossrefDT(doi, input, ts) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala index b11e2d8de..526ff7b3a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala @@ -1,6 +1,7 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass import org.apache.hadoop.io.{IntWritable, Text} @@ -27,7 +28,7 @@ object GenerateCrossrefDataset { def crossrefElement(meta: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(meta) - val doi:String = (json \ "DOI").extract[String] + val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long] CrossrefDT(doi, meta, timestamp)