implemented generation of ActionSet

This commit is contained in:
Sandro La Bruzzo 2020-05-26 09:15:33 +02:00
parent 2408083566
commit 25f52e19a4
8 changed files with 484 additions and 198 deletions

View File

@@ -1,10 +1,14 @@
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._
import scala.io.Source
@@ -12,8 +16,12 @@ import scala.io.Source

case class HostedByItemType(id: String, officialName: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}

case class DoiBoostAffiliation(PaperId: Long, AffiliationId: Long, GridId: String) {}

object DoiBoostMappingUtil {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  //STATIC STRING
  val MAG = "microsoft"
  val MAG_NAME = "Microsoft Academic Graph"
@@ -30,6 +38,31 @@ object DoiBoostMappingUtil {
  val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")

  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()

    item match {
      case dataset: Dataset =>
        val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
        a.setClazz(classOf[Dataset])
        a.setPayload(dataset)
        (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case publication: Publication =>
        val a: AtomicAction[Publication] = new AtomicAction[Publication]
        a.setClazz(classOf[Publication])
        a.setPayload(publication)
        (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case relation: Relation =>
        val a: AtomicAction[Relation] = new AtomicAction[Relation]
        a.setClazz(classOf[Relation])
        a.setPayload(relation)
        (relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case _ =>
        null
    }
  }

  def retrieveHostedByMap(): Map[String, HostedByItemType] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@@ -52,210 +85,249 @@ object DoiBoostMappingUtil {
  }

  def generateGridAffiliationId(gridId: String): String = {
    s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
  }

  def fixResult(result: Dataset): Dataset = {
    val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
    if (instanceType.isDefined) {
      result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
    }
    result.getInstance().asScala.foreach(i => {
      val hb = new KeyValue
      hb.setValue("Unknown Repository")
      hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
      i.setHostedby(hb)
    })
    result
  }

  def fixPublication(publication: Publication, hostedByMap: Map[String, HostedByItemType]): Publication = {
    val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
    val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
    val lissn = if (publication.getJournal == null) null else publication.getJournal.getIssnLinking

    val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)

    if (instanceType.isDefined) {
      publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
    }

    val item = retrieveHostedByItem(issn, eissn, lissn, hostedByMap)

    publication.getInstance().asScala.foreach(i => {
      val hb = new KeyValue
      if (item != null) {
        hb.setValue(item.officialName)
        hb.setKey(generateDSId(item.id))
        if (item.openAccess)
          i.setAccessright(createQualifier("Open", "dnet:access_modes"))
        publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
      }
      else {
        hb.setValue("Unknown Repository")
        hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
      }
      i.setHostedby(hb)
    })

    val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright != null && i.getAccessright.getClassid != null).map(f => f.getAccessright.getClassid)
    if (ar.nonEmpty) {
      if (ar.contains("Open")) {
        publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
      }
      else {
        publication.setBestaccessright(createQualifier(ar.head, "dnet:access_modes"))
      }
    }
    publication
  }

  def generateDSId(input: String): String = {
    val b = StringUtils.substringBefore(input, "::")
    val a = StringUtils.substringAfter(input, "::")
    s"10|${b}::${DHPUtils.md5(a)}"
  }

  def generateDataInfo(): DataInfo = {
    generateDataInfo("0.9")
  }

  def filterPublication(publication: Publication): Boolean = {

    //Case empty publication
    if (publication == null)
      return false

    //Case publication with no title
    if (publication.getTitle == null || publication.getTitle.size == 0)
      return false

    val s = publication.getTitle.asScala.count(p => p.getValue != null
      && p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))

    if (s == 0)
      return false

    // fixes #4360 (test publisher)
    val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null

    if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
      return false
    }

    //Publication with no Author
    if (publication.getAuthor == null || publication.getAuthor.size() == 0)
      return false

    //filter invalid author
    val authors = publication.getAuthor.asScala.map(s => {
      if (s.getFullname.nonEmpty) {
        s.getFullname
      }
      else
        s"${s.getName} ${s.getSurname}"
    })

    val c = authors.count(isValidAuthorName)
    if (c == 0)
      return false

    // fixes #4368
    if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
      return false

    true
  }

  def isValidAuthorName(fullName: String): Boolean = {
    if (fullName == null || fullName.isEmpty)
      return false
    if (invalidName.contains(fullName.toLowerCase.trim))
      return false
    true
  }

  def generateDataInfo(trust: String): DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    di.setInvisible(false)
    di.setTrust(trust)
    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
    di
  }

  def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, schemeId))
    sp.setValue(value)
    sp
  }

  def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, schemeId))
    sp.setValue(value)
    sp.setDataInfo(dataInfo)
    sp
  }

  def createCrossrefCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(CROSSREF)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(CROSSREF.toLowerCase))
    cf
  }

  def createUnpayWallCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(UNPAYWALL)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(UNPAYWALL.toLowerCase))
    cf
  }

  def createORIDCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(ORCID)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
    cf
  }

  def generateIdentifier(oaf: Result, doi: String): String = {
    val id = DHPUtils.md5(doi.toLowerCase)
    if (oaf.isInstanceOf[Dataset])
      return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}"
    s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
  }

  def createMAGCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(MAG_NAME)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG))
    cf
  }

  def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
    val q = new Qualifier
    q.setClassid(clsName)
    q.setClassname(clsValue)
    q.setSchemeid(schName)
    q.setSchemename(schValue)
    q
  }

  def createQualifier(cls: String, sch: String): Qualifier = {
    createQualifier(cls, cls, sch, sch)
  }

  def asField[T](value: T): Field[T] = {
    val tmp = new Field[T]
    tmp.setValue(value)
    tmp
  }
}
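A note on the new toActionSet helper above: each supported Oaf subtype is wrapped in an AtomicAction and serialized with Jackson, yielding a (canonical class name, JSON) pair; unsupported types fall through to null. A minimal usage sketch, with an invented identifier that is not taken from this commit:

// Sketch only: wrapping a Publication into an action-set record.
val pub = new Publication
pub.setId("50|doiboost____::deadbeefdeadbeefdeadbeefdeadbeef") // hypothetical id
val record: (String, String) = DoiBoostMappingUtil.toActionSet(pub)
// record._1 is "eu.dnetlib.dhp.schema.oaf.Publication"
// record._2 is the JSON serialization of the AtomicAction wrapping pub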

View File

@@ -0,0 +1,80 @@
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import org.apache.hadoop.io.Text
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.{SequenceFileOutputFormat, TextOutputFormat}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

object SparkGenerateDOIBoostActionSet {
  val logger: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

//    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
//    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
//    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
//
//    implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
//
//    val dbPublicationPath = parser.get("dbPublicationPath")
//    val dbDatasetPath = parser.get("dbDatasetPath")
//    val crossRefRelation = parser.get("crossRefRelation")
//    val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
    val workingDirPath = parser.get("targetPath")

//    spark.read.load(dbDatasetPath).as[OafDataset]
//      .map(d => DoiBoostMappingUtil.fixResult(d))
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(dbPublicationPath).as[Publication]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(crossRefRelation).as[Relation]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(dbaffiliationRelationPath).as[Relation]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")

    implicit val mapEncoderPub: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)

    val d: Dataset[(String, String)] = spark.read.load(s"$workingDirPath/actionSet").as[(String, String)]

    d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset_a8c2f90b-a3ae-4d6e-8187-47a437156e18_1590223414", classOf[Text], classOf[Text], classOf[TextOutputFormat[Text, Text]], classOf[GzipCodec])
  }
}
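The final step above materializes the (class name, JSON) tuples as gzip-compressed text key/value records via saveAsHadoopFile. A quick read-back sanity check could look like the following sketch; the split logic is an assumption based on TextOutputFormat separating key and value with a tab:

// Sketch only: inspecting one record of the generated rawset.
val raw = spark.sparkContext.textFile(s"$workingDirPath/rawset_a8c2f90b-a3ae-4d6e-8187-47a437156e18_1590223414")
raw.take(1).foreach { line =>
  val Array(clazz, json) = line.split("\t", 2) // key = class name, value = AtomicAction JSON
  logger.info(s"class=$clazz, payload length=${json.length}")
}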

View File

@@ -1,11 +1,14 @@
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._

object SparkGenerateDoiBoost {
@@ -22,7 +25,7 @@ object SparkGenerateDoiBoost {
      .appName(getClass.getSimpleName)
      .master(parser.get("master")).getOrCreate()

    import spark.implicits._

    val crossrefPublicationPath = parser.get("crossrefPublicationPath")
    val crossrefDatasetPath = parser.get("crossrefDatasetPath")
    val uwPublicationPath = parser.get("uwPublicationPath")
@@ -32,15 +35,16 @@ object SparkGenerateDoiBoost {
    logger.info("Phase 1) repartition and move all the dataset in a same working folder")

    spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
    spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
    spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
    spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
    spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")

    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]

    logger.info("Phase 2) Join Crossref with UnpayWall")
@@ -73,11 +77,49 @@ object SparkGenerateDoiBoost {
    val doiBoostPublication: Dataset[Publication] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication]

    val map = DoiBoostMappingUtil.retrieveHostedByMap()

    doiBoostPublication.filter(p => DoiBoostMappingUtil.filterPublication(p)).map(p => DoiBoostMappingUtil.fixPublication(p, map)).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")

    val affiliationPath = parser.get("affiliationPath")
    val paperAffiliationPath = parser.get("paperAffiliationPath")

    val affiliation = spark.read.load(affiliationPath).where(col("GridId").isNotNull).select(col("AffiliationId"), col("GridId"))

    val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))

    val a: Dataset[DoiBoostAffiliation] = paperAffiliation
      .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
      .select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId")).as[DoiBoostAffiliation]

    val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)

    magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
      val pub: Publication = item._1._2
      val affiliation = item._2
      val r: Relation = new Relation
      r.setSource(pub.getId)
      r.setTarget(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
      r.setRelType("resultOrganization")
      r.setRelClass("hasAuthorInstitution")
      r.setSubRelType("affiliation")
      r.setDataInfo(pub.getDataInfo)
      r.setCollectedfrom(pub.getCollectedfrom)
      val r1: Relation = new Relation
      r1.setTarget(pub.getId)
      r1.setSource(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
      r1.setRelType("resultOrganization")
      r1.setRelClass("isAuthorInstitutionOf")
      r1.setSubRelType("affiliation")
      r1.setDataInfo(pub.getDataInfo)
      r1.setCollectedfrom(pub.getCollectedfrom)
      List(r, r1)
    })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
  }
}
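Worth noting in the affiliation step above: each MAG paper/GRID pair yields two symmetric relations (hasAuthorInstitution and its inverse), and the organization side of both is derived deterministically from the GRID id. A small sketch of that derivation, using an invented GRID id that is not taken from the commit:

// Sketch only: the organization identifier used as relation source/target.
val orgId = DoiBoostMappingUtil.generateGridAffiliationId("grid.4691.a") // hypothetical GRID id
// orgId has the shape s"10|grid________::${DHPUtils.md5("grid.4691.a")}"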

View File

@@ -0,0 +1,8 @@
[
  {"paramName": "m",  "paramLongName": "master", "paramDescription": "the master name", "paramRequired": true},
  {"paramName": "dp", "paramLongName": "dbPublicationPath", "paramDescription": "the DOIBoost Publication Path", "paramRequired": true},
  {"paramName": "dd", "paramLongName": "dbDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
  {"paramName": "cr", "paramLongName": "crossRefRelation", "paramDescription": "the Crossref Relation Path", "paramRequired": true},
  {"paramName": "da", "paramLongName": "dbaffiliationRelationPath", "paramDescription": "the Affiliation Relation Path", "paramRequired": true},
  {"paramName": "w",  "paramLongName": "targetPath", "paramDescription": "the Working Path", "paramRequired": true}
]
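These parameters line up with the arguments wired into the GenerateActionSet workflow action below. For testing outside Oozie, a sketch of a local invocation; every path here is a placeholder, not a value from this commit:

// Sketch only: all paths below are hypothetical.
SparkGenerateDOIBoostActionSet.main(Array(
  "--master", "local[*]",
  "--dbPublicationPath", "/tmp/doiboost/doiBoostPublicationFiltered",
  "--dbDatasetPath", "/tmp/doiboost/crossrefDataset",
  "--crossRefRelation", "/tmp/doiboost/crossrefRelations",
  "--dbaffiliationRelationPath", "/tmp/doiboost/doiBoostPublicationAffiliation",
  "--targetPath", "/tmp/doiboost/actionDataSet"
))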

View File

@@ -1,9 +1,11 @@
[
  {"paramName": "m",  "paramLongName": "master", "paramDescription": "the master name", "paramRequired": true},
  {"paramName": "cp", "paramLongName": "crossrefPublicationPath", "paramDescription": "the Crossref Publication Path", "paramRequired": true},
  {"paramName": "cd", "paramLongName": "crossrefDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
  {"paramName": "up", "paramLongName": "uwPublicationPath", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
  {"paramName": "mp", "paramLongName": "magPublicationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
  {"paramName": "op", "paramLongName": "orcidPublicationPath", "paramDescription": "the ORCID Publication Path", "paramRequired": true},
  {"paramName": "ap", "paramLongName": "affiliationPath", "paramDescription": "the Affiliation Path", "paramRequired": true},
  {"paramName": "pa", "paramLongName": "paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
  {"paramName": "w",  "paramLongName": "workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
]

View File

@@ -20,6 +20,18 @@
        <property>
            <name>orcidPublicationPath</name>
            <description>the ORCID Publication Path</description>
        </property>
        <property>
            <name>affiliationPath</name>
            <description>the Affiliation Path</description>
        </property>
        <property>
            <name>paperAffiliationPath</name>
            <description>the paperAffiliation Path</description>
        </property>
        <property>
            <name>workingDirPath</name>
            <description>the Working Path</description>
@@ -40,7 +52,7 @@

    <start to="GenerateActionSet"/>

    <kill name="Kill">
@@ -75,36 +87,40 @@
            <arg>--uwPublicationPath</arg><arg>${uwPublicationPath}</arg>
            <arg>--magPublicationPath</arg><arg>${magPublicationPath}</arg>
            <arg>--orcidPublicationPath</arg><arg>${orcidPublicationPath}</arg>
            <arg>--affiliationPath</arg><arg>${affiliationPath}</arg>
            <arg>--paperAffiliationPath</arg><arg>${paperAffiliationPath}</arg>
            <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="GenerateActionSet"/>
        <error to="Kill"/>
    </action>

    <action name="GenerateActionSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Generate DOIBoost ActionSet</name>
            <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--dbPublicationPath</arg><arg>${workingDirPath}/doiBoostPublicationFiltered</arg>
            <arg>--dbDatasetPath</arg><arg>${workingDirPath}/crossrefDataset</arg>
            <arg>--crossRefRelation</arg><arg>/data/doiboost/input/crossref/relations</arg>
            <arg>--dbaffiliationRelationPath</arg><arg>${workingDirPath}/doiBoostPublicationAffiliation</arg>
            <arg>--targetPath</arg><arg>${workingDirPath}/actionDataSet</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>

View File

@@ -1,7 +1,11 @@
package eu.dnetlib.dhp.doiboost

import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test

class DoiBoostHostedByMapTest {
@@ -13,4 +17,46 @@ class DoiBoostHostedByMapTest {
  }

  @Test
  def testFilter(): Unit = {
    val conf: SparkConf = new SparkConf()
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master("local[*]").getOrCreate()

    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)

    val pub = spark.read.load("/data/doiboost/doiboostPublicationFiltered").as[Publication]

    val mapper = new ObjectMapper()

    val map = DoiBoostMappingUtil.retrieveHostedByMap()

    println(pub.map(p => DoiBoostMappingUtil.fixPublication(p, map)).count())
  }

  @Test
  def idDSGeneration(): Unit = {
    val s = "doajarticles::0066-782X"
    println(DoiBoostMappingUtil.generateDSId(s))
  }
}
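The idDSGeneration test exercises generateDSId, which splits its input on "::", keeps the prefix verbatim, and hashes only the suffix. Spelled out for the test input, a sketch assuming DHPUtils is in scope (the md5 digest itself is not reproduced here):

// Sketch only: the expected shape of the generated datasource id.
val dsId = DoiBoostMappingUtil.generateDSId("doajarticles::0066-782X")
assert(dsId == s"10|doajarticles::${DHPUtils.md5("0066-782X")}")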

File diff suppressed because one or more lines are too long