Add to the ORCID workflow a method that extracts ORCID data directly from the dump generated by Enrico

Sandro La Bruzzo 2021-04-13 17:47:43 +02:00
parent 1542196a33
commit 479abd10cb
7 changed files with 198 additions and 86 deletions

View File

@@ -4,16 +4,23 @@ import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
 import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
-import eu.dnetlib.dhp.schema.orcid.OrcidDOI
+import eu.dnetlib.dhp.schema.orcid.{AuthorData, OrcidDOI}
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
 import org.apache.commons.lang.StringUtils
 import org.slf4j.{Logger, LoggerFactory}
 import scala.collection.JavaConverters._
+import org.json4s
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST._
+import org.json4s.jackson.JsonMethods._

-case class ORCIDItem(oid:String, name:String, surname:String, creditName:String, errorCode:String){}
+case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){}
+case class OrcidAuthor(oid:String, name:Option[String], surname:Option[String], creditName:Option[String], otherNames:Option[List[String]], errorCode:Option[String]){}
+case class OrcidWork(oid:String, doi:String)
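For orientation, the three new case classes are assumed to fit together as sketched below: authors and works are keyed by the ORCID id (oid) and later regrouped per DOI by the run() method further down. The values are invented and only illustrate the shape of the data.

// Illustrative only: invented values showing how the new case classes relate.
val author = OrcidAuthor("0000-0002-1825-0097", Some("Ada"), Some("Lovelace"), None, None, None)
val work   = OrcidWork("0000-0002-1825-0097", "10.1000/xyz123")
val item   = ORCIDItem(work.doi, List(author))  // one DOI together with all of its ORCID authors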
@@ -46,8 +53,52 @@ object ORCIDToOAF {
   }

-  def convertTOOAF(input:OrcidDOI) :Publication = {
-    val doi = input.getDoi
+  def strValid(s:Option[String]) : Boolean = {
+    s.isDefined && s.get.nonEmpty
+  }
+
+  def authorValid(author:OrcidAuthor): Boolean = {
+    if (strValid(author.name) && strValid(author.surname)) {
+      return true
+    }
+    if (strValid(author.surname)) {
+      return true
+    }
+    if (strValid(author.creditName)) {
+      return true
+    }
+    false
+  }
+
+  def extractDOIWorks(input:String): List[OrcidWork] = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: json4s.JValue = parse(input)
+
+    val oid = (json \ "workDetail" \ "oid").extract[String]
+    val doi:List[(String, String)] = for {
+      JObject(extIds) <- json \ "workDetail" \ "extIds"
+      JField("type", JString(typeValue)) <- extIds
+      JField("value", JString(value)) <- extIds
+      if "doi".equalsIgnoreCase(typeValue)
+    } yield (typeValue, value)
+    if (doi.nonEmpty) {
+      return doi.map(l => OrcidWork(oid, l._2))
+    }
+    List()
+  }
+
+  def convertORCIDAuthor(input:String): OrcidAuthor = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: json4s.JValue = parse(input)
+
+    (json \ "authorData").extractOrElse[OrcidAuthor](null)
+  }
+
+  def convertTOOAF(input:ORCIDItem) :Publication = {
+    val doi = input.doi
     val pub:Publication = new Publication
     pub.setPid(List(createSP(doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
     pub.setDataInfo(generateDataInfo())
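The two new parsers assume one JSON record per line in the dump: a work record carries a workDetail object with the oid and a list of extIds, and an author record carries an authorData object. A minimal standalone sketch of the work-side extraction follows; the object name ExtractSketch and the sample record are invented for illustration, only the field layout (workDetail, oid, extIds, type, value) is taken from the parser above.

import org.json4s.DefaultFormats
import org.json4s.JsonAST._
import org.json4s.jackson.JsonMethods._

object ExtractSketch {
  // Hypothetical one-line work record, shaped after the fields read by extractDOIWorks.
  val workRecord: String =
    """{"workDetail":{"oid":"0000-0002-1825-0097","extIds":[{"type":"doi","value":"10.1000/xyz123"}]}}"""

  def main(args: Array[String]): Unit = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    val json = parse(workRecord)
    val oid = (json \ "workDetail" \ "oid").extract[String]
    // Same for-comprehension pattern as above: keep only external ids of type "doi".
    val dois = for {
      JObject(extId) <- json \ "workDetail" \ "extIds"
      JField("type", JString(t)) <- extId
      JField("value", JString(v)) <- extId
      if "doi".equalsIgnoreCase(t)
    } yield v
    println(s"$oid -> ${dois.mkString(", ")}")   // 0000-0002-1825-0097 -> 10.1000/xyz123
  }
}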
@@ -58,8 +109,8 @@ object ORCIDToOAF {
     try{
-      val l:List[Author]= input.getAuthors.asScala.map(a=> {
-        generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
+      val l:List[Author]= input.authors.map(a=> {
+        generateAuthor(a)
       })(collection.breakOut)
       pub.setAuthor(l.asJava)
@@ -80,16 +131,20 @@ object ORCIDToOAF {
     di
   }

-  def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
+  def generateAuthor(o : OrcidAuthor): Author = {
     val a = new Author
-    a.setName(given)
-    a.setSurname(family)
-    if (fullName!= null && fullName.nonEmpty)
-      a.setFullname(fullName)
-    else
-      a.setFullname(s"$given $family")
-    if (StringUtils.isNotBlank(orcid))
-      a.setPid(List(createSP(orcid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava)
+    if (strValid(o.name)) {
+      a.setName(o.name.get.capitalize)
+    }
+    if (strValid(o.surname)) {
+      a.setSurname(o.surname.get.capitalize)
+    }
+    if (strValid(o.name) && strValid(o.surname))
+      a.setFullname(s"${o.name.get.capitalize} ${o.surname.get.capitalize}")
+    else if (strValid(o.creditName))
+      a.setFullname(o.creditName.get)
+    if (StringUtils.isNotBlank(o.oid))
+      a.setPid(List(createSP(o.oid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava)
     a
   }
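The name handling in generateAuthor can be summarised by the standalone sketch below. fullnameOf is a hypothetical helper written only to illustrate the fallback order (name plus surname first, creditName otherwise); it is not part of the commit.

object AuthorNameSketch {
  // Mirrors strValid/generateAuthor above: Option fields as they come from the ORCID dump.
  def fullnameOf(name: Option[String], surname: Option[String], creditName: Option[String]): Option[String] = {
    def valid(s: Option[String]) = s.isDefined && s.get.nonEmpty
    if (valid(name) && valid(surname)) Some(s"${name.get.capitalize} ${surname.get.capitalize}")
    else if (valid(creditName)) creditName
    else None
  }

  def main(args: Array[String]): Unit = {
    println(fullnameOf(Some("ada"), Some("lovelace"), None))    // Some(Ada Lovelace)
    println(fullnameOf(None, Some("lovelace"), Some("A. L."))) // Some(A. L.)
    println(fullnameOf(None, None, None))                      // None
  }
}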

View File

@@ -5,68 +5,48 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.oa.merge.AuthorMerger
 import eu.dnetlib.dhp.schema.oaf.Publication
 import eu.dnetlib.dhp.schema.orcid.OrcidDOI
+import eu.dnetlib.doiboost.mag.ConversionUtil
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}

 object SparkConvertORCIDToOAF {
   val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)

-  def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
-
-    override def zero: Publication = new Publication()
-
-    override def reduce(b: Publication, a: (String, Publication)): Publication = {
-      b.mergeFrom(a._2)
-      b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
-      if (b.getId == null)
-        b.setId(a._2.getId)
-      b
-    }
-
-    override def merge(wx: Publication, wy: Publication): Publication = {
-      wx.mergeFrom(wy)
-      wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
-      if(wx.getId == null && wy.getId.nonEmpty)
-        wx.setId(wy.getId)
-      wx
-    }
-
-    override def finish(reduction: Publication): Publication = reduction
-
-    override def bufferEncoder: Encoder[Publication] =
-      Encoders.kryo(classOf[Publication])
-
-    override def outputEncoder: Encoder[Publication] =
-      Encoders.kryo(classOf[Publication])
-  }
-
-  def run(spark:SparkSession, sourcePath:String, targetPath:String):Unit = {
+  def run(spark:SparkSession, sourcePath:String, workingPath:String, targetPath:String):Unit = {
+    import spark.implicits._
     implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
-    implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
-    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
-
-    val mapper = new ObjectMapper()
-    mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
-
-    val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s, classOf[OrcidDOI])))
+
+    val inputRDD:RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s!= null).filter(s => ORCIDToOAF.authorValid(s))
+
+    spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author")
+
+    val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s!= null)
+
+    spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works")
+
+    val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
+
+    val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
+
+    works.joinWith(authors, authors("oid").equalTo(works("oid")))
+      .map(i => {
+        val doi = i._1.doi
+        val author = i._2
+        (doi, author)
+      }).groupBy(col("_1").alias("doi"))
+      .agg(collect_list(col("_2")).alias("authors"))
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor")
+
+    val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]

     logger.info("Converting ORCID to OAF")
-    dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
-      .map(d => (d.getId, d))
-      .groupByKey(_._1)(Encoders.STRING)
-      .agg(getPublicationAggregator().toColumn)
-      .map(p => p._2)
-      .write.mode(SaveMode.Overwrite).save(targetPath)
+    dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
   }

   def main(args: Array[String]): Unit = {
     val conf: SparkConf = new SparkConf()
     val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
     parser.parseArgument(args)
@@ -78,10 +58,10 @@ object SparkConvertORCIDToOAF {
       .master(parser.get("master")).getOrCreate()

     val sourcePath = parser.get("sourcePath")
+    val workingPath = parser.get("workingPath")
     val targetPath = parser.get("targetPath")
-    run(spark, sourcePath, targetPath)
+    run(spark, sourcePath, workingPath, targetPath)
   }
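The core of the new run() method is the works-to-authors join keyed on the ORCID id, regrouped per DOI. Below is a self-contained local sketch of that step; WorkRow, AuthorRow and JoinSketch are invented stand-ins for the commit's case classes, not code from the repository.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Hypothetical stand-ins for OrcidWork and OrcidAuthor, just to keep the sketch self-contained.
case class WorkRow(oid: String, doi: String)
case class AuthorRow(oid: String, name: String)

object JoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("orcid-join-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val works   = Seq(WorkRow("0000-0002-1825-0097", "10.1000/xyz123")).toDS()
    val authors = Seq(AuthorRow("0000-0002-1825-0097", "Ada Lovelace")).toDS()

    // Same shape as run() above: join works and authors on the ORCID id,
    // then collect every author under its DOI.
    val grouped = works.joinWith(authors, authors("oid").equalTo(works("oid")))
      .map(i => (i._1.doi, i._2))
      .groupBy(col("_1").alias("doi"))
      .agg(collect_list(col("_2")).alias("authors"))

    grouped.show(truncate = false)
    spark.stop()
  }
}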

View File

@@ -1,5 +1,6 @@
 [
-  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
+  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the Orcid Input file", "paramRequired": true},
+  {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path ", "paramRequired": true},
   {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
   {"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}

View File

@@ -74,6 +74,11 @@
     <!-- ORCID Parameters -->
     <property>
         <name>inputPathOrcid</name>
+        <description>the ORCID input path</description>
+    </property>
+    <property>
+        <name>workingPathOrcid</name>
         <description>the ORCID working path</description>
     </property>
@@ -295,6 +300,7 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>
             <arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
+            <arg>--workingPath</arg><arg>${workingPathOrcid}</arg>
             <arg>--targetPath</arg><arg>${workingPath}/orcidPublication</arg>
             <arg>--master</arg><arg>yarn-cluster</arg>
         </spark>

View File

@@ -2,12 +2,13 @@ package eu.dnetlib.doiboost.orcid

 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.Publication
-import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
-import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test
+import org.junit.jupiter.api.io.TempDir
 import org.slf4j.{Logger, LoggerFactory}
+import java.nio.file.Path
 import scala.io.Source

 class MappingORCIDToOAFTest {
@@ -24,27 +25,37 @@ class MappingORCIDToOAFTest {
     })
   }

-//  @Test
-//  def testOAFConvert():Unit ={
-//
-//    val spark: SparkSession =
-//      SparkSession
-//        .builder()
-//        .appName(getClass.getSimpleName)
-//        .master("local[*]").getOrCreate()
-//
-//
-//    SparkConvertORCIDToOAF.run( spark,"/Users/sandro/Downloads/orcid", "/Users/sandro/Downloads/orcid_oaf")
-//    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
-//    import spark.implicits._
-//    val df = spark.read.load("/Users/sandro/Downloads/orcid_oaf").as[Publication]
-//    println(df.first.getId)
-//    println(mapper.writeValueAsString(df.first()))
-//
-//
-//
-//  }
+  @Test
+  def testOAFConvert(@TempDir testDir: Path):Unit ={
+    val sourcePath:String = getClass.getResource("/eu/dnetlib/doiboost/orcid/datasets").getPath
+    val targetPath: String = s"${testDir.toString}/output/orcidPublication"
+    val workingPath = s"${testDir.toString}/wp/"
+
+    val spark: SparkSession =
+      SparkSession
+        .builder()
+        .appName(getClass.getSimpleName)
+        .master("local[*]").getOrCreate()
+    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
+    import spark.implicits._
+
+    SparkConvertORCIDToOAF.run( spark, sourcePath, workingPath, targetPath)
+
+    val mapper = new ObjectMapper()
+
+    val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count()
+
+    val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication]
+
+    assertTrue(oA == p.count())
+
+    println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(p.first()))
+
+  }

File diff suppressed because one or more lines are too long