forked from D-Net/dnet-hadoop
added scholix generation
This commit is contained in:
parent 4c54bd8742
commit 8535506c22
@@ -4,9 +4,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.oaf.Relation
 import eu.dnetlib.dhp.schema.sx.scholix.Scholix
 import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
+import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}

 object SparkCreateScholix {
@@ -31,40 +32,45 @@ object SparkCreateScholix
     log.info(s"targetPath -> $targetPath")


-    implicit val relEncoder:Encoder[Relation] = Encoders.kryo[Relation]
-    implicit val summaryEncoder :Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
-    implicit val scholixEncoder :Encoder[Scholix] = Encoders.kryo[Scholix]
+    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
+    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
+    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
+    import spark.implicits._


-    val relationDS:Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
+    val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
       .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))

-    val summaryDS:Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
-      .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
-
-    val res: Array[((String, Relation), (String, ScholixSummary))] =relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left").take(10)
-    res.foreach(r =>println(r._1._2))
-
-//    relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
-//      .map { input: ((String, Relation), (String, ScholixSummary)) =>
-//        val rel: Relation = input._1._2
-//        val source: ScholixSummary = input._2._2
-//
-//
-//        val s = new Scholix
-//
-//
-//      }
+    val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
+      .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
+
+    relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
+      .map { input: ((String, Relation), (String, ScholixSummary)) =>
+        val rel: Relation = input._1._2
+        val source: ScholixSummary = input._2._2
+        (rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
+      }(Encoders.tuple(Encoders.STRING, scholixEncoder))
+      .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
+
+    val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
+
+    scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
+      .map { input: ((String, Scholix), (String, ScholixSummary)) =>
+        val s: Scholix = input._1._2
+        val target: ScholixSummary = input._2._2
+        ScholixUtils.generateCompleteScholix(s, target)
+      }.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
+
+    val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
+
+    scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).groupByKey(_.getIdentifier).reduceGroups { (x, y) =>
+      if (x != null)
+        x
+      else
+        y
+    }.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
   }
 }
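The job above stages its output in three steps: scholix_from_source holds half-built links keyed by the target id, scholix_one_verse holds the links completed with the target summary, and scholix holds the one-verse links together with their generated inverses, deduplicated by identifier. A minimal read-back sketch of those outputs follows; it assumes a live SparkSession named spark and the same targetPath used by the job, and the helper name is only illustrative.

import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

def inspectScholixOutput(spark: SparkSession, targetPath: String): Unit = {
  implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]

  // one-verse: only the source -> target direction of each link
  val oneVerse: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
  // final output: one-verse links plus their inverses, one record per identifier
  val full: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]

  println(s"one-verse links: ${oneVerse.count()}, with inverses: ${full.count()}")
}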
@@ -4,6 +4,7 @@ package eu.dnetlib.dhp.sx.graph.scholix
 import eu.dnetlib.dhp.schema.oaf.{Dataset, Relation, Result, StructuredProperty}
 import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixCollectedFrom, ScholixEntityId, ScholixIdentifier, ScholixRelationship, ScholixResource}
 import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
+import eu.dnetlib.dhp.utils.DHPUtils
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
@@ -45,13 +46,35 @@ object ScholixUtils

   def extractRelationDate(summary: ScholixSummary):String = {

-    if(summary.getDate== null && !summary.getDate.isEmpty)
+    if(summary.getDate== null || summary.getDate.isEmpty)
       null
     else {
       summary.getDate.get(0)
     }
+  }
+
+  def inverseRelationShip(rel:ScholixRelationship):ScholixRelationship = {
+    new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
+  }
+
+  def createInverseScholixRelation(scholix: Scholix):Scholix = {
+    val s = new Scholix
+    s.setPublicationDate(scholix.getPublicationDate)
+    s.setPublisher(scholix.getPublisher)
+    s.setLinkprovider(scholix.getLinkprovider)
+    s.setRelationship(inverseRelationShip(scholix.getRelationship))
+    s.setSource(scholix.getTarget)
+    s.setTarget(scholix.getSource)
+    s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
+    s
   }

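inverseRelationShip swaps a relationship's name with its inverse while keeping the schema, and createInverseScholixRelation uses it to mirror a whole link: source and target are exchanged and the identifier is recomputed. A small behavioural sketch, assuming the constructor argument order (name, schema, inverse) implied by the call above; the relation values are only examples.

import eu.dnetlib.dhp.schema.sx.scholix.ScholixRelationship

// example values only; constructor order (name, schema, inverse) assumed from the call above
val rel: ScholixRelationship = new ScholixRelationship("IsSupplementTo", "datacite", "IsSupplementedBy")
val inv: ScholixRelationship = ScholixUtils.inverseRelationShip(rel)
// inv.getName    == "IsSupplementedBy"
// inv.getInverse == "IsSupplementTo"
// inv.getSchema  == "datacite"   (carried over unchanged)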
@@ -78,6 +101,19 @@
   }


+  def generateCompleteScholix(scholix: Scholix, target:ScholixSummary): Scholix = {
+    val s = new Scholix
+    s.setPublicationDate(scholix.getPublicationDate)
+    s.setPublisher(scholix.getPublisher)
+    s.setLinkprovider(scholix.getLinkprovider)
+    s.setRelationship(scholix.getRelationship)
+    s.setSource(scholix.getSource)
+    s.setTarget(generateScholixResourceFromSummary(target))
+    s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
+    s
+  }
+
+
   def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
     val r = new ScholixResource
     r.setIdentifier(summaryObject.getLocalIdentifier)
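generateCompleteScholix is the second half of the two-pass build: it copies the source side of a half-built scholix, fills the target in from the matching summary, and re-hashes the identifier. A hypothetical usage sketch; the two input vals are placeholders, not values from the codebase.

import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary

val halfScholix: Scholix = ???          // e.g. one record read back from scholix_from_source
val targetSummary: ScholixSummary = ??? // the summary whose id matches that record's key
val complete: Scholix = ScholixUtils.generateCompleteScholix(halfScholix, targetSummary)
// complete.getTarget comes from the summary; the identifier is
// md5("<source id>::<relation name>::<target id>")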
@@ -0,0 +1,6 @@
+[
+  {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
+  {"paramName":"r", "paramLongName":"relationPath", "paramDescription": "the relation resolved Path", "paramRequired": true},
+  {"paramName":"s", "paramLongName":"summaryPath", "paramDescription": "the summary Path", "paramRequired": true},
+  {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target base path of the scholix", "paramRequired": true}
+]
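These four parameters are the ones the CreateScholix oozie action below passes on the command line (--master, --relationPath, --summaryPath, --targetPath). A sketch of how SparkCreateScholix would presumably consume them with the ArgumentApplicationParser imported in the first hunk; the resource path and the parseArgument/get calls follow the usual dnet-hadoop pattern and are assumptions, not shown in this commit.

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils

// inside the job object; the resource name is hypothetical
def parseArgs(args: Array[String]): (String, String, String, String) = {
  val parser = new ArgumentApplicationParser(
    IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")))
  parser.parseArgument(args)
  (parser.get("master"), parser.get("relationPath"), parser.get("summaryPath"), parser.get("targetPath"))
}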
@@ -10,7 +10,7 @@
         </property>
     </parameters>

-    <start to="CreateSummaries"/>
+    <start to="CreateScholix"/>

     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@@ -90,6 +90,33 @@
             <arg>--sourcePath</arg><arg>${targetPath}/dedup</arg>
             <arg>--targetPath</arg><arg>${targetPath}/provision/summaries</arg>
         </spark>
+        <ok to="CreateScholix"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CreateScholix">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Generate Scholix Dataset</name>
+            <class>eu.dnetlib.dhp.sx.graph.SparkCreateScholix</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.shuffle.partitions=6000
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--master</arg><arg>yarn</arg>
+            <arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
+            <arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
+            <arg>--relationPath</arg><arg>${targetPath}/resolved/resolvedRelation</arg>
+
+        </spark>
         <ok to="End"/>
         <error to="Kill"/>
     </action>
File diff suppressed because one or more lines are too long