forked from D-Net/dnet-hadoop
implemented generation of ActionSet
This commit is contained in:
parent
2408083566
commit
25f52e19a4
|
@ -1,10 +1,14 @@
|
|||
package eu.dnetlib.doiboost
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Publication, Qualifier, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
|
@ -12,8 +16,12 @@ import scala.io.Source
|
|||
|
||||
case class HostedByItemType(id: String, officialName: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
||||
|
||||
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:String){}
|
||||
|
||||
object DoiBoostMappingUtil {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
|
||||
//STATIC STRING
|
||||
val MAG = "microsoft"
|
||||
val MAG_NAME = "Microsoft Academic Graph"
|
||||
|
@ -30,6 +38,31 @@ object DoiBoostMappingUtil {
|
|||
|
||||
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
|
||||
|
||||
def toActionSet(item:Oaf) :(String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
case dataset: Dataset =>
|
||||
val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
|
||||
a.setClazz(classOf[Dataset])
|
||||
a.setPayload(dataset)
|
||||
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case publication: Publication =>
|
||||
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||
a.setClazz(classOf[Publication])
|
||||
a.setPayload(publication)
|
||||
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case relation: Relation =>
|
||||
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||
a.setClazz(classOf[Relation])
|
||||
a.setPayload(relation)
|
||||
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case _ =>
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
def retrieveHostedByMap(): Map[String, HostedByItemType] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
@ -52,41 +85,80 @@ object DoiBoostMappingUtil {
|
|||
|
||||
}
|
||||
|
||||
def generateGridAffiliationId(gridId:String) :String = {
|
||||
s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
|
||||
}
|
||||
|
||||
|
||||
def fixResult(result: Dataset) :Dataset = {
|
||||
val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
||||
if (instanceType.isDefined) {
|
||||
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
||||
}
|
||||
result.getInstance().asScala.foreach(i => {
|
||||
val hb = new KeyValue
|
||||
hb.setValue("Unknown Repository")
|
||||
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
|
||||
i.setHostedby(hb)
|
||||
})
|
||||
result
|
||||
}
|
||||
|
||||
def fixPublication(publication: Publication, hostedByMap: Map[String, HostedByItemType]): Publication = {
|
||||
if (publication.getJournal == null)
|
||||
return publication
|
||||
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
|
||||
val eissn =if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
|
||||
val lissn =if (publication.getJournal == null) null else publication.getJournal.getIssnLinking
|
||||
|
||||
val issn = publication.getJournal.getIssnPrinted
|
||||
val eissn = publication.getJournal.getIssnOnline
|
||||
val lissn = publication.getJournal.getIssnLinking
|
||||
val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
||||
|
||||
if (instanceType.isDefined) {
|
||||
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
||||
}
|
||||
|
||||
val item = retrieveHostedByItem(issn, eissn, lissn, hostedByMap)
|
||||
if (item!= null) {
|
||||
val l = publication.getInstance().asScala.map(i =>{
|
||||
publication.getInstance().asScala.foreach(i => {
|
||||
val hb = new KeyValue
|
||||
hb.setValue (item.officialName)
|
||||
hb.setKey (s"10|${item.id}" )
|
||||
i.setHostedby(hb)
|
||||
if(item.openAccess)
|
||||
if (item != null) {
|
||||
hb.setValue(item.officialName)
|
||||
hb.setKey(generateDSId(item.id))
|
||||
if (item.openAccess)
|
||||
i.setAccessright(createQualifier("Open", "dnet:access_modes"))
|
||||
i
|
||||
}).asJava
|
||||
publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
|
||||
}
|
||||
else {
|
||||
hb.setValue("Unknown Repository")
|
||||
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
|
||||
}
|
||||
i.setHostedby(hb)
|
||||
})
|
||||
|
||||
publication.setInstance(l)
|
||||
val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid)
|
||||
if (ar.nonEmpty) {
|
||||
if(ar.contains("Open")){
|
||||
publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
|
||||
}
|
||||
else {
|
||||
publication.setBestaccessright(createQualifier(ar.head, "dnet:access_modes"))
|
||||
}
|
||||
}
|
||||
publication
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def generateDataInfo (): DataInfo = {
|
||||
generateDataInfo ("0.9")
|
||||
}
|
||||
def generateDSId(input: String): String = {
|
||||
|
||||
val b = StringUtils.substringBefore(input, "::")
|
||||
val a = StringUtils.substringAfter(input, "::")
|
||||
s"10|${b}::${DHPUtils.md5(a)}"
|
||||
}
|
||||
|
||||
|
||||
def filterPublication (publication: Publication): Boolean = {
|
||||
def generateDataInfo(): DataInfo = {
|
||||
generateDataInfo("0.9")
|
||||
}
|
||||
|
||||
|
||||
def filterPublication(publication: Publication): Boolean = {
|
||||
|
||||
//Case empty publication
|
||||
if (publication == null)
|
||||
|
@ -97,8 +169,8 @@ def generateDataInfo (): DataInfo = {
|
|||
return false
|
||||
|
||||
|
||||
val s = publication.getTitle.asScala.count (p => p.getValue != null
|
||||
&& p.getValue.nonEmpty && ! p.getValue.equalsIgnoreCase ("[NO TITLE AVAILABLE]") )
|
||||
val s = publication.getTitle.asScala.count(p => p.getValue != null
|
||||
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
|
||||
|
||||
if (s == 0)
|
||||
return false
|
||||
|
@ -106,104 +178,104 @@ def generateDataInfo (): DataInfo = {
|
|||
// fixes #4360 (test publisher)
|
||||
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
|
||||
|
||||
if (publisher != null && (publisher.equalsIgnoreCase ("Test accounts") || publisher.equalsIgnoreCase ("CrossRef Test Account") ) ) {
|
||||
if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//Publication with no Author
|
||||
if (publication.getAuthor == null || publication.getAuthor.size () == 0)
|
||||
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
|
||||
return false
|
||||
|
||||
|
||||
//filter invalid author
|
||||
val authors = publication.getAuthor.asScala.map (s => {
|
||||
val authors = publication.getAuthor.asScala.map(s => {
|
||||
if (s.getFullname.nonEmpty) {
|
||||
s.getFullname
|
||||
}
|
||||
}
|
||||
else
|
||||
s"${
|
||||
s.getName
|
||||
} ${
|
||||
} ${
|
||||
s.getSurname
|
||||
}"
|
||||
})
|
||||
}"
|
||||
})
|
||||
|
||||
val c = authors.count (isValidAuthorName)
|
||||
val c = authors.count(isValidAuthorName)
|
||||
if (c == 0)
|
||||
return false
|
||||
|
||||
// fixes #4368
|
||||
if (authors.count (s => s.equalsIgnoreCase ("Addie Jackson") ) > 0 && "Elsevier BV".equalsIgnoreCase (publication.getPublisher.getValue) )
|
||||
if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
|
||||
return false
|
||||
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def isValidAuthorName (fullName: String): Boolean = {
|
||||
def isValidAuthorName(fullName: String): Boolean = {
|
||||
if (fullName == null || fullName.isEmpty)
|
||||
return false
|
||||
if (invalidName.contains (fullName.toLowerCase.trim) )
|
||||
if (invalidName.contains(fullName.toLowerCase.trim))
|
||||
return false
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def generateDataInfo (trust: String): DataInfo = {
|
||||
def generateDataInfo(trust: String): DataInfo = {
|
||||
val di = new DataInfo
|
||||
di.setDeletedbyinference (false)
|
||||
di.setInferred (false)
|
||||
di.setInvisible (false)
|
||||
di.setTrust (trust)
|
||||
di.setProvenanceaction (createQualifier ("sysimport:actionset", "dnet:provenanceActions") )
|
||||
di.setDeletedbyinference(false)
|
||||
di.setInferred(false)
|
||||
di.setInvisible(false)
|
||||
di.setTrust(trust)
|
||||
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
||||
di
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def createSP (value: String, classId: String, schemeId: String): StructuredProperty = {
|
||||
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier (createQualifier (classId, schemeId) )
|
||||
sp.setValue (value)
|
||||
sp.setQualifier(createQualifier(classId, schemeId))
|
||||
sp.setValue(value)
|
||||
sp
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
def createSP (value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
||||
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier (createQualifier (classId, schemeId) )
|
||||
sp.setValue (value)
|
||||
sp.setDataInfo (dataInfo)
|
||||
sp.setQualifier(createQualifier(classId, schemeId))
|
||||
sp.setValue(value)
|
||||
sp.setDataInfo(dataInfo)
|
||||
sp
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
def createCrossrefCollectedFrom (): KeyValue = {
|
||||
def createCrossrefCollectedFrom(): KeyValue = {
|
||||
|
||||
val cf = new KeyValue
|
||||
cf.setValue (CROSSREF)
|
||||
cf.setKey ("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5 (CROSSREF.toLowerCase) )
|
||||
cf.setValue(CROSSREF)
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(CROSSREF.toLowerCase))
|
||||
cf
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def createUnpayWallCollectedFrom (): KeyValue = {
|
||||
def createUnpayWallCollectedFrom(): KeyValue = {
|
||||
|
||||
val cf = new KeyValue
|
||||
cf.setValue (UNPAYWALL)
|
||||
cf.setKey ("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5 (UNPAYWALL.toLowerCase) )
|
||||
cf.setValue(UNPAYWALL)
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(UNPAYWALL.toLowerCase))
|
||||
cf
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
def createORIDCollectedFrom (): KeyValue = {
|
||||
def createORIDCollectedFrom(): KeyValue = {
|
||||
|
||||
val cf = new KeyValue
|
||||
cf.setValue (ORCID)
|
||||
cf.setKey ("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5 (ORCID.toLowerCase) )
|
||||
cf.setValue(ORCID)
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
|
||||
cf
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def generateIdentifier (oaf: Result, doi: String): String = {
|
||||
|
@ -211,51 +283,51 @@ def generateDataInfo (): DataInfo = {
|
|||
if (oaf.isInstanceOf[Dataset] )
|
||||
return s"60|${
|
||||
doiBoostNSPREFIX
|
||||
}${
|
||||
}${
|
||||
SEPARATOR
|
||||
}${
|
||||
}${
|
||||
id
|
||||
}"
|
||||
}"
|
||||
s"50|${
|
||||
doiBoostNSPREFIX
|
||||
}${
|
||||
}${
|
||||
SEPARATOR
|
||||
}${
|
||||
}${
|
||||
id
|
||||
}"
|
||||
}
|
||||
}"
|
||||
}
|
||||
|
||||
|
||||
def createMAGCollectedFrom (): KeyValue = {
|
||||
def createMAGCollectedFrom(): KeyValue = {
|
||||
|
||||
val cf = new KeyValue
|
||||
cf.setValue (MAG)
|
||||
cf.setKey ("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5 (MAG) )
|
||||
cf.setValue(MAG_NAME)
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG))
|
||||
cf
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
def createQualifier (clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
|
||||
def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
|
||||
val q = new Qualifier
|
||||
q.setClassid (clsName)
|
||||
q.setClassname (clsValue)
|
||||
q.setSchemeid (schName)
|
||||
q.setSchemename (schValue)
|
||||
q.setClassid(clsName)
|
||||
q.setClassname(clsValue)
|
||||
q.setSchemeid(schName)
|
||||
q.setSchemename(schValue)
|
||||
q
|
||||
}
|
||||
}
|
||||
|
||||
def createQualifier (cls: String, sch: String): Qualifier = {
|
||||
createQualifier (cls, cls, sch, sch)
|
||||
}
|
||||
def createQualifier(cls: String, sch: String): Qualifier = {
|
||||
createQualifier(cls, cls, sch, sch)
|
||||
}
|
||||
|
||||
|
||||
def asField[T] (value: T): Field[T] = {
|
||||
def asField[T](value: T): Field[T] = {
|
||||
val tmp = new Field[T]
|
||||
tmp.setValue (value)
|
||||
tmp.setValue(value)
|
||||
tmp
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
package eu.dnetlib.doiboost
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.{SequenceFileOutputFormat, TextOutputFormat}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkGenerateDOIBoostActionSet {
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
// implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
// implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
// implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
//
|
||||
// implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
|
||||
//
|
||||
//
|
||||
//
|
||||
// val dbPublicationPath = parser.get("dbPublicationPath")
|
||||
// val dbDatasetPath = parser.get("dbDatasetPath")
|
||||
// val crossRefRelation = parser.get("crossRefRelation")
|
||||
// val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
|
||||
val workingDirPath = parser.get("targetPath")
|
||||
//
|
||||
// spark.read.load(dbDatasetPath).as[OafDataset]
|
||||
// .map(d =>DoiBoostMappingUtil.fixResult(d))
|
||||
// .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
//
|
||||
// spark.read.load(dbPublicationPath).as[Publication]
|
||||
// .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
//
|
||||
// spark.read.load(crossRefRelation).as[Relation]
|
||||
// .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
//
|
||||
// spark.read.load(dbaffiliationRelationPath).as[Relation]
|
||||
// .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
|
||||
|
||||
implicit val mapEncoderPub: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
|
||||
val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
|
||||
|
||||
SequenceFileOutputFormat
|
||||
|
||||
d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset_a8c2f90b-a3ae-4d6e-8187-47a437156e18_1590223414", classOf[Text], classOf[Text], classOf[TextOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,11 +1,14 @@
|
|||
package eu.dnetlib.doiboost
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.functions.col
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkGenerateDoiBoost {
|
||||
|
||||
|
@ -22,7 +25,7 @@ object SparkGenerateDoiBoost {
|
|||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
import spark.implicits._
|
||||
val crossrefPublicationPath = parser.get("crossrefPublicationPath")
|
||||
val crossrefDatasetPath = parser.get("crossrefDatasetPath")
|
||||
val uwPublicationPath = parser.get("uwPublicationPath")
|
||||
|
@ -32,15 +35,16 @@ object SparkGenerateDoiBoost {
|
|||
|
||||
|
||||
logger.info("Phase 1) repartition and move all the dataset in a same working folder")
|
||||
// spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
|
||||
// spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
|
||||
// spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
|
||||
// spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
|
||||
// spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
|
||||
spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
|
||||
spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
|
||||
spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
|
||||
spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
|
||||
spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
|
||||
logger.info("Phase 2) Join Crossref with UnpayWall")
|
||||
|
||||
|
@ -73,11 +77,49 @@ object SparkGenerateDoiBoost {
|
|||
|
||||
val doiBoostPublication: Dataset[Publication] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication]
|
||||
|
||||
doiBoostPublication.filter(p=>DoiBoostMappingUtil.filterPublication(p)).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||
val map = DoiBoostMappingUtil.retrieveHostedByMap()
|
||||
|
||||
doiBoostPublication.filter(p=>DoiBoostMappingUtil.filterPublication(p)).map(p => DoiBoostMappingUtil.fixPublication(p, map)).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||
|
||||
|
||||
val affiliationPath = parser.get("affiliationPath")
|
||||
val paperAffiliationPath = parser.get("paperAffiliationPath")
|
||||
|
||||
val affiliation = spark.read.load(affiliationPath).where(col("GridId").isNotNull).select(col("AffiliationId"), col("GridId"))
|
||||
|
||||
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
|
||||
|
||||
|
||||
val a:Dataset[DoiBoostAffiliation] = paperAffiliation
|
||||
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId"))).select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId")).as[DoiBoostAffiliation]
|
||||
|
||||
|
||||
|
||||
val magPubs:Dataset[(String,Publication)]= spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s =>s._1!= null )
|
||||
|
||||
|
||||
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
|
||||
val pub:Publication = item._1._2
|
||||
val affiliation = item._2
|
||||
val r:Relation = new Relation
|
||||
r.setSource(pub.getId)
|
||||
r.setTarget(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
|
||||
r.setRelType("resultOrganization")
|
||||
r.setRelClass("hasAuthorInstitution")
|
||||
r.setSubRelType("affiliation")
|
||||
r.setDataInfo(pub.getDataInfo)
|
||||
r.setCollectedfrom(pub.getCollectedfrom)
|
||||
val r1:Relation = new Relation
|
||||
r1.setTarget(pub.getId)
|
||||
r1.setSource(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
|
||||
r1.setRelType("resultOrganization")
|
||||
r1.setRelClass("isAuthorInstitutionOf")
|
||||
r1.setSubRelType("affiliation")
|
||||
r1.setDataInfo(pub.getDataInfo)
|
||||
r1.setCollectedfrom(pub.getCollectedfrom)
|
||||
List(r, r1)
|
||||
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
[
|
||||
{"paramName": "m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true},
|
||||
{"paramName": "dp", "paramLongName":"dbPublicationPath", "paramDescription": "the Crossref Publication Path", "paramRequired": true},
|
||||
{"paramName": "dd", "paramLongName":"dbDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
|
||||
{"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
|
||||
{"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
||||
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||
]
|
|
@ -5,5 +5,7 @@
|
|||
{"paramName": "up", "paramLongName":"uwPublicationPath", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
|
||||
{"paramName": "mp", "paramLongName":"magPublicationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
||||
{"paramName": "op", "paramLongName":"orcidPublicationPath", "paramDescription": "the ORCID Publication Path", "paramRequired": true},
|
||||
{"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true},
|
||||
{"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
|
||||
{"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||
]
|
||||
|
|
|
@ -20,6 +20,18 @@
|
|||
<name>orcidPublicationPath</name>
|
||||
<description>the ORCID Publication Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>affiliationPath</name>
|
||||
<description>the Affliation Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>paperAffiliationPath</name>
|
||||
<description>the paperAffiliation Path</description>
|
||||
</property>
|
||||
|
||||
|
||||
<property>
|
||||
<name>workingDirPath</name>
|
||||
<description>the Working Path</description>
|
||||
|
@ -40,7 +52,7 @@
|
|||
|
||||
|
||||
|
||||
<start to="CreateDOIBoost"/>
|
||||
<start to="GenerateActionSet"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
|
@ -75,36 +87,40 @@
|
|||
<arg>--uwPublicationPath</arg><arg>${uwPublicationPath}</arg>
|
||||
<arg>--magPublicationPath</arg><arg>${magPublicationPath}</arg>
|
||||
<arg>--orcidPublicationPath</arg><arg>${orcidPublicationPath}</arg>
|
||||
<arg>--affiliationPath</arg><arg>${affiliationPath}</arg>
|
||||
<arg>--paperAffiliationPath</arg><arg>${paperAffiliationPath}</arg>
|
||||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="GenerateActionSet"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="GenerateActionSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Generate DOIBoost ActionSet</name>
|
||||
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--dbPublicationPath</arg><arg>${workingDirPath}/doiBoostPublicationFiltered</arg>
|
||||
<arg>--dbDatasetPath</arg><arg>${workingDirPath}/crossrefDataset</arg>
|
||||
<arg>--crossRefRelation</arg><arg>/data/doiboost/input/crossref/relations</arg>
|
||||
<arg>--dbaffiliationRelationPath</arg><arg>${workingDirPath}/doiBoostPublicationAffiliation</arg>
|
||||
<arg>--targetPath</arg><arg>${workingDirPath}/actionDataSet</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<!-- <action name="PreprocessMag">-->
|
||||
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
|
||||
<!-- <master>yarn-cluster</master>-->
|
||||
<!-- <mode>cluster</mode>-->
|
||||
<!-- <name>Convert Mag to Dataset</name>-->
|
||||
<!-- <class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>-->
|
||||
<!-- <jar>dhp-doiboost-${projectVersion}.jar</jar>-->
|
||||
<!-- <spark-opts>-->
|
||||
<!-- --executor-memory=${sparkExecutorMemory}-->
|
||||
<!-- --executor-cores=${sparkExecutorCores}-->
|
||||
<!-- --driver-memory=${sparkDriverMemory}-->
|
||||
<!-- --conf spark.sql.shuffle.partitions=3840-->
|
||||
<!-- ${sparkExtraOPT}-->
|
||||
<!-- </spark-opts>-->
|
||||
<!-- <arg>--sourcePath</arg><arg>${sourcePath}</arg>-->
|
||||
<!-- <arg>--targetPath</arg><arg>${targetPath}</arg>-->
|
||||
<!-- <arg>--master</arg><arg>yarn-cluster</arg>-->
|
||||
<!-- </spark>-->
|
||||
<!-- <ok to="End"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,7 +1,11 @@
|
|||
package eu.dnetlib.dhp.doiboost
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
class DoiBoostHostedByMapTest {
|
||||
|
@ -13,4 +17,46 @@ class DoiBoostHostedByMapTest {
|
|||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testFilter():Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master("local[*]").getOrCreate()
|
||||
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||
|
||||
|
||||
|
||||
val pub =spark.read.load("/data/doiboost/doiboostPublicationFiltered").as[Publication]
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val map = DoiBoostMappingUtil.retrieveHostedByMap()
|
||||
|
||||
println(pub.map(p => DoiBoostMappingUtil.fixPublication(p, map)).count())
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def idDSGeneration():Unit = {
|
||||
val s ="doajarticles::0066-782X"
|
||||
|
||||
|
||||
|
||||
println(DoiBoostMappingUtil.generateDSId(s))
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue