forked from D-Net/dnet-hadoop
implemented generation of ActionSet
This commit is contained in:
parent
2408083566
commit
25f52e19a4
|
@ -1,10 +1,14 @@
|
||||||
package eu.dnetlib.doiboost
|
package eu.dnetlib.doiboost
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Publication, Qualifier, Result, StructuredProperty}
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
|
import org.apache.commons.lang3.StringUtils
|
||||||
|
import org.codehaus.jackson.map.ObjectMapper
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
@ -12,8 +16,12 @@ import scala.io.Source
|
||||||
|
|
||||||
case class HostedByItemType(id: String, officialName: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
case class HostedByItemType(id: String, officialName: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
||||||
|
|
||||||
|
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:String){}
|
||||||
|
|
||||||
object DoiBoostMappingUtil {
|
object DoiBoostMappingUtil {
|
||||||
|
|
||||||
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
|
||||||
//STATIC STRING
|
//STATIC STRING
|
||||||
val MAG = "microsoft"
|
val MAG = "microsoft"
|
||||||
val MAG_NAME = "Microsoft Academic Graph"
|
val MAG_NAME = "Microsoft Academic Graph"
|
||||||
|
@ -30,6 +38,31 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
|
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
|
||||||
|
|
||||||
|
def toActionSet(item:Oaf) :(String, String) = {
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
item match {
|
||||||
|
case dataset: Dataset =>
|
||||||
|
val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
|
||||||
|
a.setClazz(classOf[Dataset])
|
||||||
|
a.setPayload(dataset)
|
||||||
|
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case publication: Publication =>
|
||||||
|
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||||
|
a.setClazz(classOf[Publication])
|
||||||
|
a.setPayload(publication)
|
||||||
|
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case relation: Relation =>
|
||||||
|
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||||
|
a.setClazz(classOf[Relation])
|
||||||
|
a.setPayload(relation)
|
||||||
|
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case _ =>
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def retrieveHostedByMap(): Map[String, HostedByItemType] = {
|
def retrieveHostedByMap(): Map[String, HostedByItemType] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
@ -52,41 +85,80 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def generateGridAffiliationId(gridId:String) :String = {
|
||||||
|
s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def fixResult(result: Dataset) :Dataset = {
|
||||||
|
val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
||||||
|
if (instanceType.isDefined) {
|
||||||
|
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
||||||
|
}
|
||||||
|
result.getInstance().asScala.foreach(i => {
|
||||||
|
val hb = new KeyValue
|
||||||
|
hb.setValue("Unknown Repository")
|
||||||
|
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
|
||||||
|
i.setHostedby(hb)
|
||||||
|
})
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
def fixPublication(publication: Publication, hostedByMap: Map[String, HostedByItemType]): Publication = {
|
def fixPublication(publication: Publication, hostedByMap: Map[String, HostedByItemType]): Publication = {
|
||||||
if (publication.getJournal == null)
|
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
|
||||||
return publication
|
val eissn =if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
|
||||||
|
val lissn =if (publication.getJournal == null) null else publication.getJournal.getIssnLinking
|
||||||
|
|
||||||
val issn = publication.getJournal.getIssnPrinted
|
val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
||||||
val eissn = publication.getJournal.getIssnOnline
|
|
||||||
val lissn = publication.getJournal.getIssnLinking
|
if (instanceType.isDefined) {
|
||||||
|
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
||||||
|
}
|
||||||
|
|
||||||
val item = retrieveHostedByItem(issn, eissn, lissn, hostedByMap)
|
val item = retrieveHostedByItem(issn, eissn, lissn, hostedByMap)
|
||||||
if (item!= null) {
|
publication.getInstance().asScala.foreach(i => {
|
||||||
val l = publication.getInstance().asScala.map(i =>{
|
|
||||||
val hb = new KeyValue
|
val hb = new KeyValue
|
||||||
hb.setValue (item.officialName)
|
if (item != null) {
|
||||||
hb.setKey (s"10|${item.id}" )
|
hb.setValue(item.officialName)
|
||||||
i.setHostedby(hb)
|
hb.setKey(generateDSId(item.id))
|
||||||
if(item.openAccess)
|
if (item.openAccess)
|
||||||
i.setAccessright(createQualifier("Open", "dnet:access_modes"))
|
i.setAccessright(createQualifier("Open", "dnet:access_modes"))
|
||||||
i
|
publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
|
||||||
}).asJava
|
}
|
||||||
|
else {
|
||||||
|
hb.setValue("Unknown Repository")
|
||||||
|
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
|
||||||
|
}
|
||||||
|
i.setHostedby(hb)
|
||||||
|
})
|
||||||
|
|
||||||
publication.setInstance(l)
|
val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid)
|
||||||
|
if (ar.nonEmpty) {
|
||||||
|
if(ar.contains("Open")){
|
||||||
|
publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
publication.setBestaccessright(createQualifier(ar.head, "dnet:access_modes"))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
publication
|
publication
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateDataInfo (): DataInfo = {
|
def generateDSId(input: String): String = {
|
||||||
generateDataInfo ("0.9")
|
|
||||||
}
|
val b = StringUtils.substringBefore(input, "::")
|
||||||
|
val a = StringUtils.substringAfter(input, "::")
|
||||||
|
s"10|${b}::${DHPUtils.md5(a)}"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def filterPublication (publication: Publication): Boolean = {
|
def generateDataInfo(): DataInfo = {
|
||||||
|
generateDataInfo("0.9")
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def filterPublication(publication: Publication): Boolean = {
|
||||||
|
|
||||||
//Case empty publication
|
//Case empty publication
|
||||||
if (publication == null)
|
if (publication == null)
|
||||||
|
@ -97,8 +169,8 @@ def generateDataInfo (): DataInfo = {
|
||||||
return false
|
return false
|
||||||
|
|
||||||
|
|
||||||
val s = publication.getTitle.asScala.count (p => p.getValue != null
|
val s = publication.getTitle.asScala.count(p => p.getValue != null
|
||||||
&& p.getValue.nonEmpty && ! p.getValue.equalsIgnoreCase ("[NO TITLE AVAILABLE]") )
|
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
|
||||||
|
|
||||||
if (s == 0)
|
if (s == 0)
|
||||||
return false
|
return false
|
||||||
|
@ -106,104 +178,104 @@ def generateDataInfo (): DataInfo = {
|
||||||
// fixes #4360 (test publisher)
|
// fixes #4360 (test publisher)
|
||||||
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
|
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
|
||||||
|
|
||||||
if (publisher != null && (publisher.equalsIgnoreCase ("Test accounts") || publisher.equalsIgnoreCase ("CrossRef Test Account") ) ) {
|
if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//Publication with no Author
|
//Publication with no Author
|
||||||
if (publication.getAuthor == null || publication.getAuthor.size () == 0)
|
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
|
||||||
return false
|
return false
|
||||||
|
|
||||||
|
|
||||||
//filter invalid author
|
//filter invalid author
|
||||||
val authors = publication.getAuthor.asScala.map (s => {
|
val authors = publication.getAuthor.asScala.map(s => {
|
||||||
if (s.getFullname.nonEmpty) {
|
if (s.getFullname.nonEmpty) {
|
||||||
s.getFullname
|
s.getFullname
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
s"${
|
s"${
|
||||||
s.getName
|
s.getName
|
||||||
} ${
|
} ${
|
||||||
s.getSurname
|
s.getSurname
|
||||||
}"
|
}"
|
||||||
})
|
})
|
||||||
|
|
||||||
val c = authors.count (isValidAuthorName)
|
val c = authors.count(isValidAuthorName)
|
||||||
if (c == 0)
|
if (c == 0)
|
||||||
return false
|
return false
|
||||||
|
|
||||||
// fixes #4368
|
// fixes #4368
|
||||||
if (authors.count (s => s.equalsIgnoreCase ("Addie Jackson") ) > 0 && "Elsevier BV".equalsIgnoreCase (publication.getPublisher.getValue) )
|
if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
|
||||||
return false
|
return false
|
||||||
|
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def isValidAuthorName (fullName: String): Boolean = {
|
def isValidAuthorName(fullName: String): Boolean = {
|
||||||
if (fullName == null || fullName.isEmpty)
|
if (fullName == null || fullName.isEmpty)
|
||||||
return false
|
return false
|
||||||
if (invalidName.contains (fullName.toLowerCase.trim) )
|
if (invalidName.contains(fullName.toLowerCase.trim))
|
||||||
return false
|
return false
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateDataInfo (trust: String): DataInfo = {
|
def generateDataInfo(trust: String): DataInfo = {
|
||||||
val di = new DataInfo
|
val di = new DataInfo
|
||||||
di.setDeletedbyinference (false)
|
di.setDeletedbyinference(false)
|
||||||
di.setInferred (false)
|
di.setInferred(false)
|
||||||
di.setInvisible (false)
|
di.setInvisible(false)
|
||||||
di.setTrust (trust)
|
di.setTrust(trust)
|
||||||
di.setProvenanceaction (createQualifier ("sysimport:actionset", "dnet:provenanceActions") )
|
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
||||||
di
|
di
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def createSP (value: String, classId: String, schemeId: String): StructuredProperty = {
|
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
|
||||||
val sp = new StructuredProperty
|
val sp = new StructuredProperty
|
||||||
sp.setQualifier (createQualifier (classId, schemeId) )
|
sp.setQualifier(createQualifier(classId, schemeId))
|
||||||
sp.setValue (value)
|
sp.setValue(value)
|
||||||
sp
|
sp
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def createSP (value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
||||||
val sp = new StructuredProperty
|
val sp = new StructuredProperty
|
||||||
sp.setQualifier (createQualifier (classId, schemeId) )
|
sp.setQualifier(createQualifier(classId, schemeId))
|
||||||
sp.setValue (value)
|
sp.setValue(value)
|
||||||
sp.setDataInfo (dataInfo)
|
sp.setDataInfo(dataInfo)
|
||||||
sp
|
sp
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def createCrossrefCollectedFrom (): KeyValue = {
|
def createCrossrefCollectedFrom(): KeyValue = {
|
||||||
|
|
||||||
val cf = new KeyValue
|
val cf = new KeyValue
|
||||||
cf.setValue (CROSSREF)
|
cf.setValue(CROSSREF)
|
||||||
cf.setKey ("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5 (CROSSREF.toLowerCase) )
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(CROSSREF.toLowerCase))
|
||||||
cf
|
cf
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def createUnpayWallCollectedFrom (): KeyValue = {
|
def createUnpayWallCollectedFrom(): KeyValue = {
|
||||||
|
|
||||||
val cf = new KeyValue
|
val cf = new KeyValue
|
||||||
cf.setValue (UNPAYWALL)
|
cf.setValue(UNPAYWALL)
|
||||||
cf.setKey ("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5 (UNPAYWALL.toLowerCase) )
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(UNPAYWALL.toLowerCase))
|
||||||
cf
|
cf
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def createORIDCollectedFrom (): KeyValue = {
|
def createORIDCollectedFrom(): KeyValue = {
|
||||||
|
|
||||||
val cf = new KeyValue
|
val cf = new KeyValue
|
||||||
cf.setValue (ORCID)
|
cf.setValue(ORCID)
|
||||||
cf.setKey ("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5 (ORCID.toLowerCase) )
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
|
||||||
cf
|
cf
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateIdentifier (oaf: Result, doi: String): String = {
|
def generateIdentifier (oaf: Result, doi: String): String = {
|
||||||
|
@ -211,51 +283,51 @@ def generateDataInfo (): DataInfo = {
|
||||||
if (oaf.isInstanceOf[Dataset] )
|
if (oaf.isInstanceOf[Dataset] )
|
||||||
return s"60|${
|
return s"60|${
|
||||||
doiBoostNSPREFIX
|
doiBoostNSPREFIX
|
||||||
}${
|
}${
|
||||||
SEPARATOR
|
SEPARATOR
|
||||||
}${
|
}${
|
||||||
id
|
id
|
||||||
}"
|
}"
|
||||||
s"50|${
|
s"50|${
|
||||||
doiBoostNSPREFIX
|
doiBoostNSPREFIX
|
||||||
}${
|
}${
|
||||||
SEPARATOR
|
SEPARATOR
|
||||||
}${
|
}${
|
||||||
id
|
id
|
||||||
}"
|
}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def createMAGCollectedFrom (): KeyValue = {
|
def createMAGCollectedFrom(): KeyValue = {
|
||||||
|
|
||||||
val cf = new KeyValue
|
val cf = new KeyValue
|
||||||
cf.setValue (MAG)
|
cf.setValue(MAG_NAME)
|
||||||
cf.setKey ("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5 (MAG) )
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG))
|
||||||
cf
|
cf
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def createQualifier (clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
|
def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
|
||||||
val q = new Qualifier
|
val q = new Qualifier
|
||||||
q.setClassid (clsName)
|
q.setClassid(clsName)
|
||||||
q.setClassname (clsValue)
|
q.setClassname(clsValue)
|
||||||
q.setSchemeid (schName)
|
q.setSchemeid(schName)
|
||||||
q.setSchemename (schValue)
|
q.setSchemename(schValue)
|
||||||
q
|
q
|
||||||
}
|
}
|
||||||
|
|
||||||
def createQualifier (cls: String, sch: String): Qualifier = {
|
def createQualifier(cls: String, sch: String): Qualifier = {
|
||||||
createQualifier (cls, cls, sch, sch)
|
createQualifier(cls, cls, sch, sch)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def asField[T] (value: T): Field[T] = {
|
def asField[T](value: T): Field[T] = {
|
||||||
val tmp = new Field[T]
|
val tmp = new Field[T]
|
||||||
tmp.setValue (value)
|
tmp.setValue(value)
|
||||||
tmp
|
tmp
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
package eu.dnetlib.doiboost
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||||
|
import org.apache.hadoop.io.Text
|
||||||
|
import org.apache.commons.io.IOUtils
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
|
import org.apache.hadoop.mapred.{SequenceFileOutputFormat, TextOutputFormat}
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
object SparkGenerateDOIBoostActionSet {
|
||||||
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
val conf: SparkConf = new SparkConf()
|
||||||
|
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
|
||||||
|
parser.parseArgument(args)
|
||||||
|
val spark: SparkSession =
|
||||||
|
SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
|
.appName(getClass.getSimpleName)
|
||||||
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
// implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
|
// implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||||
|
// implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
||||||
|
//
|
||||||
|
// implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// val dbPublicationPath = parser.get("dbPublicationPath")
|
||||||
|
// val dbDatasetPath = parser.get("dbDatasetPath")
|
||||||
|
// val crossRefRelation = parser.get("crossRefRelation")
|
||||||
|
// val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
|
||||||
|
val workingDirPath = parser.get("targetPath")
|
||||||
|
//
|
||||||
|
// spark.read.load(dbDatasetPath).as[OafDataset]
|
||||||
|
// .map(d =>DoiBoostMappingUtil.fixResult(d))
|
||||||
|
// .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
|
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||||
|
//
|
||||||
|
// spark.read.load(dbPublicationPath).as[Publication]
|
||||||
|
// .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
|
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||||
|
//
|
||||||
|
// spark.read.load(crossRefRelation).as[Relation]
|
||||||
|
// .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
|
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||||
|
//
|
||||||
|
// spark.read.load(dbaffiliationRelationPath).as[Relation]
|
||||||
|
// .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
|
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||||
|
|
||||||
|
|
||||||
|
implicit val mapEncoderPub: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||||
|
|
||||||
|
val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
|
||||||
|
|
||||||
|
SequenceFileOutputFormat
|
||||||
|
|
||||||
|
d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset_a8c2f90b-a3ae-4d6e-8187-47a437156e18_1590223414", classOf[Text], classOf[Text], classOf[TextOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,11 +1,14 @@
|
||||||
package eu.dnetlib.doiboost
|
package eu.dnetlib.doiboost
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||||
|
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.functions.col
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
object SparkGenerateDoiBoost {
|
object SparkGenerateDoiBoost {
|
||||||
|
|
||||||
|
@ -22,7 +25,7 @@ object SparkGenerateDoiBoost {
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
import spark.implicits._
|
||||||
val crossrefPublicationPath = parser.get("crossrefPublicationPath")
|
val crossrefPublicationPath = parser.get("crossrefPublicationPath")
|
||||||
val crossrefDatasetPath = parser.get("crossrefDatasetPath")
|
val crossrefDatasetPath = parser.get("crossrefDatasetPath")
|
||||||
val uwPublicationPath = parser.get("uwPublicationPath")
|
val uwPublicationPath = parser.get("uwPublicationPath")
|
||||||
|
@ -32,15 +35,16 @@ object SparkGenerateDoiBoost {
|
||||||
|
|
||||||
|
|
||||||
logger.info("Phase 1) repartition and move all the dataset in a same working folder")
|
logger.info("Phase 1) repartition and move all the dataset in a same working folder")
|
||||||
// spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
|
spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
|
||||||
// spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
|
spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
|
||||||
// spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
|
spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
|
||||||
// spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
|
spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
|
||||||
// spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
|
spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")
|
||||||
|
|
||||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||||
|
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
||||||
|
|
||||||
logger.info("Phase 2) Join Crossref with UnpayWall")
|
logger.info("Phase 2) Join Crossref with UnpayWall")
|
||||||
|
|
||||||
|
@ -73,11 +77,49 @@ object SparkGenerateDoiBoost {
|
||||||
|
|
||||||
val doiBoostPublication: Dataset[Publication] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication]
|
val doiBoostPublication: Dataset[Publication] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication]
|
||||||
|
|
||||||
doiBoostPublication.filter(p=>DoiBoostMappingUtil.filterPublication(p)).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
|
val map = DoiBoostMappingUtil.retrieveHostedByMap()
|
||||||
|
|
||||||
|
doiBoostPublication.filter(p=>DoiBoostMappingUtil.filterPublication(p)).map(p => DoiBoostMappingUtil.fixPublication(p, map)).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||||
|
|
||||||
|
|
||||||
|
val affiliationPath = parser.get("affiliationPath")
|
||||||
|
val paperAffiliationPath = parser.get("paperAffiliationPath")
|
||||||
|
|
||||||
|
val affiliation = spark.read.load(affiliationPath).where(col("GridId").isNotNull).select(col("AffiliationId"), col("GridId"))
|
||||||
|
|
||||||
|
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
|
||||||
|
|
||||||
|
|
||||||
|
val a:Dataset[DoiBoostAffiliation] = paperAffiliation
|
||||||
|
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId"))).select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId")).as[DoiBoostAffiliation]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
val magPubs:Dataset[(String,Publication)]= spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
|
||||||
|
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s =>s._1!= null )
|
||||||
|
|
||||||
|
|
||||||
|
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
|
||||||
|
val pub:Publication = item._1._2
|
||||||
|
val affiliation = item._2
|
||||||
|
val r:Relation = new Relation
|
||||||
|
r.setSource(pub.getId)
|
||||||
|
r.setTarget(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
|
||||||
|
r.setRelType("resultOrganization")
|
||||||
|
r.setRelClass("hasAuthorInstitution")
|
||||||
|
r.setSubRelType("affiliation")
|
||||||
|
r.setDataInfo(pub.getDataInfo)
|
||||||
|
r.setCollectedfrom(pub.getCollectedfrom)
|
||||||
|
val r1:Relation = new Relation
|
||||||
|
r1.setTarget(pub.getId)
|
||||||
|
r1.setSource(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
|
||||||
|
r1.setRelType("resultOrganization")
|
||||||
|
r1.setRelClass("isAuthorInstitutionOf")
|
||||||
|
r1.setSubRelType("affiliation")
|
||||||
|
r1.setDataInfo(pub.getDataInfo)
|
||||||
|
r1.setCollectedfrom(pub.getCollectedfrom)
|
||||||
|
List(r, r1)
|
||||||
|
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
[
|
||||||
|
{"paramName": "m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true},
|
||||||
|
{"paramName": "dp", "paramLongName":"dbPublicationPath", "paramDescription": "the Crossref Publication Path", "paramRequired": true},
|
||||||
|
{"paramName": "dd", "paramLongName":"dbDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
|
||||||
|
{"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
|
||||||
|
{"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
||||||
|
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||||
|
]
|
|
@ -5,5 +5,7 @@
|
||||||
{"paramName": "up", "paramLongName":"uwPublicationPath", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
|
{"paramName": "up", "paramLongName":"uwPublicationPath", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
|
||||||
{"paramName": "mp", "paramLongName":"magPublicationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
{"paramName": "mp", "paramLongName":"magPublicationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
||||||
{"paramName": "op", "paramLongName":"orcidPublicationPath", "paramDescription": "the ORCID Publication Path", "paramRequired": true},
|
{"paramName": "op", "paramLongName":"orcidPublicationPath", "paramDescription": "the ORCID Publication Path", "paramRequired": true},
|
||||||
|
{"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true},
|
||||||
|
{"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
|
||||||
{"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
|
{"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||||
]
|
]
|
||||||
|
|
|
@ -20,6 +20,18 @@
|
||||||
<name>orcidPublicationPath</name>
|
<name>orcidPublicationPath</name>
|
||||||
<description>the ORCID Publication Path</description>
|
<description>the ORCID Publication Path</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>affiliationPath</name>
|
||||||
|
<description>the Affliation Path</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>paperAffiliationPath</name>
|
||||||
|
<description>the paperAffiliation Path</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>workingDirPath</name>
|
<name>workingDirPath</name>
|
||||||
<description>the Working Path</description>
|
<description>the Working Path</description>
|
||||||
|
@ -40,7 +52,7 @@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<start to="CreateDOIBoost"/>
|
<start to="GenerateActionSet"/>
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -75,36 +87,40 @@
|
||||||
<arg>--uwPublicationPath</arg><arg>${uwPublicationPath}</arg>
|
<arg>--uwPublicationPath</arg><arg>${uwPublicationPath}</arg>
|
||||||
<arg>--magPublicationPath</arg><arg>${magPublicationPath}</arg>
|
<arg>--magPublicationPath</arg><arg>${magPublicationPath}</arg>
|
||||||
<arg>--orcidPublicationPath</arg><arg>${orcidPublicationPath}</arg>
|
<arg>--orcidPublicationPath</arg><arg>${orcidPublicationPath}</arg>
|
||||||
|
<arg>--affiliationPath</arg><arg>${affiliationPath}</arg>
|
||||||
|
<arg>--paperAffiliationPath</arg><arg>${paperAffiliationPath}</arg>
|
||||||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
<ok to="GenerateActionSet"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="GenerateActionSet">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Generate DOIBoost ActionSet</name>
|
||||||
|
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--dbPublicationPath</arg><arg>${workingDirPath}/doiBoostPublicationFiltered</arg>
|
||||||
|
<arg>--dbDatasetPath</arg><arg>${workingDirPath}/crossrefDataset</arg>
|
||||||
|
<arg>--crossRefRelation</arg><arg>/data/doiboost/input/crossref/relations</arg>
|
||||||
|
<arg>--dbaffiliationRelationPath</arg><arg>${workingDirPath}/doiBoostPublicationAffiliation</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${workingDirPath}/actionDataSet</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<!-- <action name="PreprocessMag">-->
|
|
||||||
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
|
|
||||||
<!-- <master>yarn-cluster</master>-->
|
|
||||||
<!-- <mode>cluster</mode>-->
|
|
||||||
<!-- <name>Convert Mag to Dataset</name>-->
|
|
||||||
<!-- <class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>-->
|
|
||||||
<!-- <jar>dhp-doiboost-${projectVersion}.jar</jar>-->
|
|
||||||
<!-- <spark-opts>-->
|
|
||||||
<!-- --executor-memory=${sparkExecutorMemory}-->
|
|
||||||
<!-- --executor-cores=${sparkExecutorCores}-->
|
|
||||||
<!-- --driver-memory=${sparkDriverMemory}-->
|
|
||||||
<!-- --conf spark.sql.shuffle.partitions=3840-->
|
|
||||||
<!-- ${sparkExtraOPT}-->
|
|
||||||
<!-- </spark-opts>-->
|
|
||||||
<!-- <arg>--sourcePath</arg><arg>${sourcePath}</arg>-->
|
|
||||||
<!-- <arg>--targetPath</arg><arg>${targetPath}</arg>-->
|
|
||||||
<!-- <arg>--master</arg><arg>yarn-cluster</arg>-->
|
|
||||||
<!-- </spark>-->
|
|
||||||
<!-- <ok to="End"/>-->
|
|
||||||
<!-- <error to="Kill"/>-->
|
|
||||||
<!-- </action>-->
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -1,7 +1,11 @@
|
||||||
package eu.dnetlib.dhp.doiboost
|
package eu.dnetlib.dhp.doiboost
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
|
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
|
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
||||||
class DoiBoostHostedByMapTest {
|
class DoiBoostHostedByMapTest {
|
||||||
|
@ -13,4 +17,46 @@ class DoiBoostHostedByMapTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testFilter():Unit = {
|
||||||
|
val conf: SparkConf = new SparkConf()
|
||||||
|
val spark: SparkSession =
|
||||||
|
SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
|
.appName(getClass.getSimpleName)
|
||||||
|
.master("local[*]").getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
|
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
|
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||||
|
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
val pub =spark.read.load("/data/doiboost/doiboostPublicationFiltered").as[Publication]
|
||||||
|
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
val map = DoiBoostMappingUtil.retrieveHostedByMap()
|
||||||
|
|
||||||
|
println(pub.map(p => DoiBoostMappingUtil.fixPublication(p, map)).count())
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def idDSGeneration():Unit = {
|
||||||
|
val s ="doajarticles::0066-782X"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
println(DoiBoostMappingUtil.generateDSId(s))
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue