enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
12 changed files with 201 additions and 151 deletions
Showing only changes of commit 7ac1ba2e35 - Show all commits

View File

@ -1,7 +1,7 @@
package eu.dnetlib.doiboost package eu.dnetlib.doiboost
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper import org.codehaus.jackson.map.ObjectMapper
@ -14,11 +14,21 @@ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source
case class HostedByItemType(id: String, officialName: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:String){} case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
object DoiBoostMappingUtil { object DoiBoostMappingUtil {
def getUnknownCountry(): Qualifier = {
createQualifier("UNKNOWN","UNKNOWN","dnet:countries","dnet:countries")
}
def generateMAGAffiliationId(affId: String): String = {
s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
}
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
@ -52,6 +62,11 @@ object DoiBoostMappingUtil {
a.setClazz(classOf[Publication]) a.setClazz(classOf[Publication])
a.setPayload(publication) a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case organization: Organization =>
val a: AtomicAction[Organization] = new AtomicAction[Organization]
a.setClazz(classOf[Organization])
a.setPayload(organization)
(organization.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation => case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation] val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation]) a.setClazz(classOf[Relation])
@ -64,27 +79,33 @@ object DoiBoostMappingUtil {
} }
def retrieveHostedByMap(): Map[String, HostedByItemType] = { def toHostedByItem(input:String): (String, HostedByItemType) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val jsonMap = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/hbMap.json")).mkString
lazy val json: json4s.JValue = parse(jsonMap) lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, HostedByItemType]] val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]]
(c.keys.head, c.values.head)
} }
def retrieveHostedByItem(issn: String, eissn: String, lissn: String, hostedByMap: Map[String, HostedByItemType]): HostedByItemType = {
if (issn != null && issn.nonEmpty && hostedByMap.contains(issn))
return hostedByMap(issn)
if (eissn != null && eissn.nonEmpty && hostedByMap.contains(eissn)) def toISSNPair(publication: Publication) : (String, Publication) = {
return hostedByMap(eissn) val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
val eissn =if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
if (lissn != null && lissn.nonEmpty && hostedByMap.contains(lissn)) val lissn =if (publication.getJournal == null) null else publication.getJournal.getIssnLinking
return hostedByMap(lissn)
null
if (issn!= null && issn.nonEmpty)
(issn, publication)
else if(eissn!= null && eissn.nonEmpty)
(eissn, publication)
else if(lissn!= null && lissn.nonEmpty)
(lissn, publication)
else
(publication.getId, publication)
} }
def generateGridAffiliationId(gridId:String) :String = { def generateGridAffiliationId(gridId:String) :String = {
s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}" s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
} }
@ -96,18 +117,36 @@ object DoiBoostMappingUtil {
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
} }
result.getInstance().asScala.foreach(i => { result.getInstance().asScala.foreach(i => {
val hb = new KeyValue i.setHostedby(getUbknownHostedBy())
hb.setValue("Unknown Repository")
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
i.setHostedby(hb)
}) })
result result
} }
def fixPublication(publication: Publication, hostedByMap: Map[String, HostedByItemType]): Publication = { def getUbknownHostedBy():KeyValue = {
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted val hb = new KeyValue
val eissn =if (publication.getJournal == null) null else publication.getJournal.getIssnOnline hb.setValue("Unknown Repository")
val lissn =if (publication.getJournal == null) null else publication.getJournal.getIssnLinking hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
hb
}
def getOpenAccessQualifier():Qualifier = {
createQualifier("OPEN","Open Access","dnet:access_modes", "dnet:access_modes")
}
def getRestrictedQualifier():Qualifier = {
createQualifier("RESTRICTED","Restricted","dnet:access_modes", "dnet:access_modes")
}
def fixPublication(input:((String,Publication), (String,HostedByItemType))): Publication = {
val publication = input._1._2
val item = if (input._2 != null) input._2._2 else null
val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty) val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
@ -115,15 +154,15 @@ object DoiBoostMappingUtil {
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
} }
val item = retrieveHostedByItem(issn, eissn, lissn, hostedByMap)
publication.getInstance().asScala.foreach(i => { publication.getInstance().asScala.foreach(i => {
val hb = new KeyValue val hb = new KeyValue
if (item != null) { if (item != null) {
hb.setValue(item.officialName) hb.setValue(item.officialname)
hb.setKey(generateDSId(item.id)) hb.setKey(generateDSId(item.id))
if (item.openAccess) if (item.openAccess)
i.setAccessright(createQualifier("OPEN", "dnet:access_modes")) i.setAccessright(getOpenAccessQualifier())
publication.setBestaccessright(createQualifier("OPEN", "dnet:access_modes")) publication.setBestaccessright(getOpenAccessQualifier())
} }
else { else {
hb.setValue("Unknown Repository") hb.setValue("Unknown Repository")
@ -135,10 +174,10 @@ object DoiBoostMappingUtil {
val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid) val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid)
if (ar.nonEmpty) { if (ar.nonEmpty) {
if(ar.contains("OPEN")){ if(ar.contains("OPEN")){
publication.setBestaccessright(createQualifier("OPEN", "dnet:access_modes")) publication.setBestaccessright(getOpenAccessQualifier())
} }
else { else {
publication.setBestaccessright(createQualifier(ar.head, "dnet:access_modes")) publication.setBestaccessright(getRestrictedQualifier())
} }
} }
publication publication
@ -298,6 +337,8 @@ object DoiBoostMappingUtil {
} }
def createMAGCollectedFrom(): KeyValue = { def createMAGCollectedFrom(): KeyValue = {
val cf = new KeyValue val cf = new KeyValue

View File

@ -2,10 +2,13 @@ package eu.dnetlib.doiboost
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{ Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkGenerateDOIBoostActionSet { object SparkGenerateDOIBoostActionSet {
@ -23,8 +26,10 @@ object SparkGenerateDOIBoostActionSet {
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation] implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]] implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
@ -32,17 +37,23 @@ object SparkGenerateDOIBoostActionSet {
val dbDatasetPath = parser.get("dbDatasetPath") val dbDatasetPath = parser.get("dbDatasetPath")
val crossRefRelation = parser.get("crossRefRelation") val crossRefRelation = parser.get("crossRefRelation")
val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath") val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
val dbOrganizationPath = parser.get("dbOrganizationPath")
val workingDirPath = parser.get("targetPath") val workingDirPath = parser.get("targetPath")
spark.read.load(dbDatasetPath).as[OafDataset] spark.read.load(dbDatasetPath).as[OafDataset]
.map(d =>DoiBoostMappingUtil.fixResult(d)) .map(d =>DoiBoostMappingUtil.fixResult(d))
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
spark.read.load(dbPublicationPath).as[Publication] spark.read.load(dbPublicationPath).as[Publication]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
spark.read.load(dbOrganizationPath).as[Organization]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
spark.read.load(crossRefRelation).as[Relation] spark.read.load(crossRefRelation).as[Relation]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
@ -52,15 +63,9 @@ object SparkGenerateDOIBoostActionSet {
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
// implicit val mapEncoderPub: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING) val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
//
// val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
//
//
//
// d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset_a8c2f90b-a3ae-4d6e-8187-47a437156e18_1590223414", classOf[Text], classOf[Text], classOf[TextOutputFormat[Text,Text]], classOf[GzipCodec])
d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])

View File

@ -1,7 +1,7 @@
package eu.dnetlib.doiboost package eu.dnetlib.doiboost
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset, Organization}
import eu.dnetlib.doiboost.mag.ConversionUtil import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
@ -26,22 +26,13 @@ object SparkGenerateDoiBoost {
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
import spark.implicits._ import spark.implicits._
val crossrefPublicationPath = parser.get("crossrefPublicationPath")
val crossrefDatasetPath = parser.get("crossrefDatasetPath") val hostedByMapPath = parser.get("hostedByMapPath")
val uwPublicationPath = parser.get("uwPublicationPath")
val magPublicationPath = parser.get("magPublicationPath")
val orcidPublicationPath = parser.get("orcidPublicationPath")
val workingDirPath = parser.get("workingDirPath") val workingDirPath = parser.get("workingDirPath")
// logger.info("Phase 1) repartition and move all the dataset in a same working folder")
// spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
// spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
// spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
// spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
// spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub) implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation] implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
@ -75,23 +66,26 @@ object SparkGenerateDoiBoost {
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication") sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")
val doiBoostPublication: Dataset[Publication] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication] val doiBoostPublication: Dataset[(String,Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p=>DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
val map = DoiBoostMappingUtil.retrieveHostedByMap() val hostedByDataset : Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))
doiBoostPublication.filter(p=>DoiBoostMappingUtil.filterPublication(p)).map(p => DoiBoostMappingUtil.fixPublication(p, map)).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
.map(DoiBoostMappingUtil.fixPublication)
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
val affiliationPath = parser.get("affiliationPath") val affiliationPath = parser.get("affiliationPath")
val paperAffiliationPath = parser.get("paperAffiliationPath") val paperAffiliationPath = parser.get("paperAffiliationPath")
val affiliation = spark.read.load(affiliationPath).where(col("GridId").isNotNull).select(col("AffiliationId"), col("GridId")) val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId")) val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
val a:Dataset[DoiBoostAffiliation] = paperAffiliation val a:Dataset[DoiBoostAffiliation] = paperAffiliation
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId"))).select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId")).as[DoiBoostAffiliation] .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
.select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]
@ -102,24 +96,45 @@ object SparkGenerateDoiBoost {
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).flatMap(item => { magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
val pub:Publication = item._1._2 val pub:Publication = item._1._2
val affiliation = item._2 val affiliation = item._2
val affId:String = if (affiliation.GridId.isDefined) DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId.get) else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
val r:Relation = new Relation val r:Relation = new Relation
r.setSource(pub.getId) r.setSource(pub.getId)
r.setTarget(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId)) r.setTarget(affId)
r.setRelType("resultOrganization") r.setRelType("resultOrganization")
r.setRelClass("hasAuthorInstitution") r.setRelClass("hasAuthorInstitution")
r.setSubRelType("affiliation") r.setSubRelType("affiliation")
r.setDataInfo(pub.getDataInfo) r.setDataInfo(pub.getDataInfo)
r.setCollectedfrom(pub.getCollectedfrom) r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
val r1:Relation = new Relation val r1:Relation = new Relation
r1.setTarget(pub.getId) r1.setTarget(pub.getId)
r1.setSource(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId)) r1.setSource(affId)
r1.setRelType("resultOrganization") r1.setRelType("resultOrganization")
r1.setRelClass("isAuthorInstitutionOf") r1.setRelClass("isAuthorInstitutionOf")
r1.setSubRelType("affiliation") r1.setSubRelType("affiliation")
r1.setDataInfo(pub.getDataInfo) r1.setDataInfo(pub.getDataInfo)
r1.setCollectedfrom(pub.getCollectedfrom) r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
List(r, r1) List(r, r1)
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation") })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => {
val affiliation = item._2
if (affiliation.GridId.isEmpty) {
val o = new Organization
o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
o.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString))
o.setOriginalId(List(affiliation.AffiliationId.toString).asJava)
if (affiliation.DisplayName.nonEmpty)
o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
if (affiliation.OfficialPage.isDefined)
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
o.setCountry(DoiBoostMappingUtil.getUnknownCountry())
o
}
else
null
}).filter(o=> o!=null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
} }
} }

View File

@ -106,8 +106,10 @@ case object Crossref2Oaf {
// Publisher ( Name of work's publisher mapped into Result/Publisher) // Publisher ( Name of work's publisher mapped into Result/Publisher)
val publisher = (json \ "publisher").extractOrElse[String](null) val publisher = (json \ "publisher").extractOrElse[String](null)
if (publisher!= null && publisher.nonEmpty)
result.setPublisher(asField(publisher)) result.setPublisher(asField(publisher))
// TITLE // TITLE
val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", "dnet:dataCite_title") val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", "dnet:dataCite_title")
val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", "dnet:dataCite_title") val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", "dnet:dataCite_title")
@ -120,7 +122,7 @@ case object Crossref2Oaf {
result.setDescription(descriptionList.asJava) result.setDescription(descriptionList.asJava)
// Source // Source
val sourceList = for {JString(source) <- json \ "source" if source.nonEmpty} yield asField(source) val sourceList = for {JString(source) <- json \ "source" if source!= null && source.nonEmpty} yield asField(source)
result.setSource(sourceList.asJava) result.setSource(sourceList.asJava)
//RELEVANT DATE Mapping //RELEVANT DATE Mapping
@ -168,7 +170,7 @@ case object Crossref2Oaf {
instance.setRefereed(asField("peerReviewed")) instance.setRefereed(asField("peerReviewed"))
instance.setAccessright(createQualifier("RESTRICTED", "dnet:access_modes")) instance.setAccessright(getRestrictedQualifier())
result.setInstance(List(instance).asJava) result.setInstance(List(instance).asJava)
instance.setInstancetype(createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource")) instance.setInstancetype(createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource"))
result.setResourcetype(createQualifier(cobjCategory.substring(0, 4),"dnet:dataCite_resource")) result.setResourcetype(createQualifier(cobjCategory.substring(0, 4),"dnet:dataCite_resource"))

View File

@ -115,7 +115,7 @@ case object ConversionUtil {
j.setName(ci.DisplayName.get) j.setName(ci.DisplayName.get)
if (ci.StartDate.isDefined && ci.EndDate.isDefined) if (ci.StartDate.isDefined && ci.EndDate.isDefined)
{ {
j.setConferencedate(s"${ci.StartDate.get.toString} - ${ci.EndDate.get.toString}") j.setConferencedate(s"${ci.StartDate.get.toString.substring(0,10)} - ${ci.EndDate.get.toString.substring(0,10)}")
} }
publication.setJournal(j) publication.setJournal(j)
@ -214,7 +214,7 @@ case object ConversionUtil {
if (paper.Date != null && paper.Date.isDefined) { if (paper.Date != null && paper.Date.isDefined) {
pub.setDateofacceptance(asField(paper.Date.get.toString)) pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0,10)))
} }
pub.setPublisher(asField(paper.Publisher)) pub.setPublisher(asField(paper.Publisher))
@ -280,7 +280,7 @@ case object ConversionUtil {
if (paper.Date != null) { if (paper.Date != null) {
pub.setDateofacceptance(asField(paper.Date.toString)) pub.setDateofacceptance(asField(paper.Date.toString.substring(0,10)))
} }
pub.setAuthor(authorsOAF.asJava) pub.setAuthor(authorsOAF.asJava)

View File

@ -27,7 +27,7 @@ object SparkMapUnpayWallToOAF {
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication]) implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
@ -35,7 +35,8 @@ object SparkMapUnpayWallToOAF {
val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath") val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
logger.info("Converting UnpayWall to OAF") logger.info("Converting UnpayWall to OAF")
val d:Dataset[Publication] = spark.createDataset(inputRDD.repartition(1000).map(UnpayWallToOAF.convertToOAF).filter(p=>p!=null)).as[Publication]
val d:Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p=>p!=null)).as[Publication]
d.write.mode(SaveMode.Overwrite).save(targetPath) d.write.mode(SaveMode.Overwrite).save(targetPath)
} }

View File

@ -46,7 +46,7 @@ object UnpayWallToOAF {
val i :Instance= new Instance() val i :Instance= new Instance()
i.setCollectedfrom(createUnpayWallCollectedFrom()) i.setCollectedfrom(createUnpayWallCollectedFrom())
i.setAccessright(createQualifier("OPEN", "dnet:access_modes")) i.setAccessright(getOpenAccessQualifier())
i.setUrl(List(oaLocation.url.get).asJava) i.setUrl(List(oaLocation.url.get).asJava)
if (oaLocation.license.isDefined) if (oaLocation.license.isDefined)

View File

@ -4,5 +4,6 @@
{"paramName": "dd", "paramLongName":"dbDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true}, {"paramName": "dd", "paramLongName":"dbDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
{"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true}, {"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
{"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true}, {"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
{"paramName": "do", "paramLongName":"dbOrganizationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true} {"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}
] ]

View File

@ -1,10 +1,6 @@
[ [
{"paramName": "m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}, {"paramName": "m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true},
{"paramName": "cp", "paramLongName":"crossrefPublicationPath", "paramDescription": "the Crossref Publication Path", "paramRequired": true}, {"paramName": "hb", "paramLongName":"hostedByMapPath", "paramDescription": "the hosted By Map Path", "paramRequired": true},
{"paramName": "cd", "paramLongName":"crossrefDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
{"paramName": "up", "paramLongName":"uwPublicationPath", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
{"paramName": "mp", "paramLongName":"magPublicationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
{"paramName": "op", "paramLongName":"orcidPublicationPath", "paramDescription": "the ORCID Publication Path", "paramRequired": true},
{"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true}, {"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true},
{"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true}, {"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
{"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true} {"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}

View File

@ -1,37 +1,17 @@
<workflow-app name="Create DOIBoostActionSet" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Create DOIBoostActionSet" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>crossrefPublicationPath</name> <name>hostedByMapPath</name>
<description>the Crossref Publication Path</description> <description>the Hosted By Map Path</description>
</property> </property>
<property>
<name>crossrefDatasetPath</name>
<description>the Crossref Dataset Path</description>
</property>
<property>
<name>uwPublicationPath</name>
<description>the UnpayWall Publication Path</description>
</property>
<property>
<name>magPublicationPath</name>
<description>the MAG Publication Path</description>
</property>
<property>
<name>orcidPublicationPath</name>
<description>the ORCID Publication Path</description>
</property>
<property> <property>
<name>affiliationPath</name> <name>affiliationPath</name>
<description>the Affliation Path</description> <description>the Affliation Path</description>
</property> </property>
<property> <property>
<name>paperAffiliationPath</name> <name>paperAffiliationPath</name>
<description>the paperAffiliation Path</description> <description>the paperAffiliation Path</description>
</property> </property>
<property> <property>
<name>workingDirPath</name> <name>workingDirPath</name>
<description>the Working Path</description> <description>the Working Path</description>
@ -52,7 +32,7 @@
<start to="CreateDOIBoost"/> <start to="GenerateActionSet"/>
<kill name="Kill"> <kill name="Kill">
@ -82,17 +62,13 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT} ${sparkExtraOPT}
</spark-opts> </spark-opts>
<arg>--crossrefPublicationPath</arg><arg>${crossrefPublicationPath}</arg> <arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
<arg>--crossrefDatasetPath</arg><arg>${crossrefDatasetPath}</arg>
<arg>--uwPublicationPath</arg><arg>${uwPublicationPath}</arg>
<arg>--magPublicationPath</arg><arg>${magPublicationPath}</arg>
<arg>--orcidPublicationPath</arg><arg>${orcidPublicationPath}</arg>
<arg>--affiliationPath</arg><arg>${affiliationPath}</arg> <arg>--affiliationPath</arg><arg>${affiliationPath}</arg>
<arg>--paperAffiliationPath</arg><arg>${paperAffiliationPath}</arg> <arg>--paperAffiliationPath</arg><arg>${paperAffiliationPath}</arg>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg> <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg> <arg>--master</arg><arg>yarn-cluster</arg>
</spark> </spark>
<ok to="End"/> <ok to="GenerateActionSet"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -115,6 +91,7 @@
<arg>--dbDatasetPath</arg><arg>${workingDirPath}/crossrefDataset</arg> <arg>--dbDatasetPath</arg><arg>${workingDirPath}/crossrefDataset</arg>
<arg>--crossRefRelation</arg><arg>/data/doiboost/input/crossref/relations</arg> <arg>--crossRefRelation</arg><arg>/data/doiboost/input/crossref/relations</arg>
<arg>--dbaffiliationRelationPath</arg><arg>${workingDirPath}/doiBoostPublicationAffiliation</arg> <arg>--dbaffiliationRelationPath</arg><arg>${workingDirPath}/doiBoostPublicationAffiliation</arg>
<arg>-do</arg><arg>${workingDirPath}/doiBoostOrganization</arg>
<arg>--targetPath</arg><arg>${workingDirPath}/actionDataSet</arg> <arg>--targetPath</arg><arg>${workingDirPath}/actionDataSet</arg>
<arg>--master</arg><arg>yarn-cluster</arg> <arg>--master</arg><arg>yarn-cluster</arg>
</spark> </spark>

View File

@ -1,7 +1,7 @@
package eu.dnetlib.dhp.doiboost package eu.dnetlib.dhp.doiboost
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.{DoiBoostMappingUtil, HostedByItemType}
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
import eu.dnetlib.doiboost.mag.ConversionUtil import eu.dnetlib.doiboost.mag.ConversionUtil
import eu.dnetlib.doiboost.orcid.ORCIDElement import eu.dnetlib.doiboost.orcid.ORCIDElement
@ -15,45 +15,44 @@ import scala.io.Source
class DoiBoostHostedByMapTest { class DoiBoostHostedByMapTest {
@Test
def testLoadMap(): Unit = {
println(DoiBoostMappingUtil.retrieveHostedByMap().keys.size)
} // @Test
// def testMerge():Unit = {
// val conf: SparkConf = new SparkConf()
@Test // val spark: SparkSession =
def testMerge():Unit = { // SparkSession
val conf: SparkConf = new SparkConf() // .builder()
val spark: SparkSession = // .config(conf)
SparkSession // .appName(getClass.getSimpleName)
.builder() // .master("local[*]").getOrCreate()
.config(conf) //
.appName(getClass.getSimpleName) //
.master("local[*]").getOrCreate() //
// implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
// implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
// implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] //
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] //
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub) // import spark.implicits._
// val dataset:RDD[String]= spark.sparkContext.textFile("/home/sandro/Downloads/hbMap.gz")
//
import spark.implicits._ //
val dataset:Dataset[ORCIDElement] = spark.read.json("/home/sandro/orcid").as[ORCIDElement] // val hbMap:Dataset[(String, HostedByItemType)] =spark.createDataset(dataset.map(DoiBoostMappingUtil.toHostedByItem))
//
//
dataset.show(false) // hbMap.show()
//
//
//
//
//
//
//
//
} //
//
// }
@Test @Test

View File

@ -1,5 +1,7 @@
package eu.dnetlib.doiboost.mag package eu.dnetlib.doiboost.mag
import java.sql.Timestamp
import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.dhp.schema.oaf.Publication
import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
@ -39,6 +41,17 @@ class MAGMappingTest {
@Test
def testDate() :Unit = {
val p:Timestamp = Timestamp.valueOf("2011-10-02 00:00:00")
println(p.toString.substring(0,10))
}
@Test @Test
def buildInvertedIndexTest(): Unit = { def buildInvertedIndexTest(): Unit = {
val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString