Added a normalization step for the DOI

This commit is contained in:
Miriam Baglioni 2021-06-30 10:03:15 +02:00
parent 801763a0fa
commit cf758f4f91
3 changed files with 8 additions and 4 deletions

View File

@ -16,9 +16,10 @@ import scala.collection.JavaConverters._
import scala.collection.mutable import scala.collection.mutable
import scala.util.matching.Regex import scala.util.matching.Regex
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
import java.util import java.util
import eu.dnetlib.doiboost.DoiBoostMappingUtil
case class CrossrefDT(doi: String, json:String, timestamp: Long) {} case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
case class mappingAffiliation(name: String) {} case class mappingAffiliation(name: String) {}
@ -89,7 +90,7 @@ case object Crossref2Oaf {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
//MAPPING Crossref DOI into PID //MAPPING Crossref DOI into PID
val doi: String = (json \ "DOI").extract[String] val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
result.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava) result.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
//MAPPING Crossref DOI into OriginalId //MAPPING Crossref DOI into OriginalId
@ -101,6 +102,7 @@ case object Crossref2Oaf {
val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava) val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava)
result.setOriginalId(originalIds) result.setOriginalId(originalIds)
// Add DataInfo // Add DataInfo
result.setDataInfo(generateDataInfo()) result.setDataInfo(generateDataInfo())

View File

@ -1,6 +1,7 @@
package eu.dnetlib.doiboost.crossref package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.{IntWritable, Text} import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
@ -21,7 +22,7 @@ object CrossrefDataset {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input) lazy val json: json4s.JValue = parse(input)
val ts:Long = (json \ "indexed" \ "timestamp").extract[Long] val ts:Long = (json \ "indexed" \ "timestamp").extract[Long]
val doi:String = (json \ "DOI").extract[String] val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
CrossrefDT(doi, input, ts) CrossrefDT(doi, input, ts)
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.doiboost.crossref package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item
import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass
import org.apache.hadoop.io.{IntWritable, Text} import org.apache.hadoop.io.{IntWritable, Text}
@ -27,7 +28,7 @@ object GenerateCrossrefDataset {
def crossrefElement(meta: String): CrossrefDT = { def crossrefElement(meta: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(meta) lazy val json: json4s.JValue = parse(meta)
val doi:String = (json \ "DOI").extract[String] val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long] val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long]
CrossrefDT(doi, meta, timestamp) CrossrefDT(doi, meta, timestamp)