forked from D-Net/dnet-hadoop
fixed defintion wf of creation final infospace of scholexplorer
This commit is contained in:
parent
3920c69bc8
commit
8fac10c91e
|
@ -1,7 +1,8 @@
|
||||||
package eu.dnetlib.dhp.sx.graph
|
package eu.dnetlib.dhp.sx.graph
|
||||||
import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
@ -9,12 +10,7 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
object SparkConvertRDDtoDataset {
|
object SparkConvertRDDtoDataset {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val entities = List(
|
|
||||||
("dataset", classOf[OafDataset]),
|
|
||||||
("otherresearchproduct", classOf[OtherResearchProduct]),
|
|
||||||
("publication", classOf[Publication]),
|
|
||||||
("software", classOf[Software])
|
|
||||||
)
|
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
|
@ -29,15 +25,43 @@ object SparkConvertRDDtoDataset {
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
val targetPath = parser.get("targetPath")
|
val t = parser.get("targetPath")
|
||||||
log.info(s"targetPath -> $targetPath")
|
log.info(s"targetPath -> $t")
|
||||||
val mapper = new ObjectMapper()
|
|
||||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
val entityPath = s"$t/entities"
|
||||||
|
val relPath = s"$t/relation"
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||||
|
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||||
|
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||||
|
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
|
||||||
|
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
||||||
|
|
||||||
|
|
||||||
|
log.info("Converting dataset")
|
||||||
|
val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset]))
|
||||||
|
spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset")
|
||||||
|
|
||||||
|
|
||||||
|
log.info("Converting publication")
|
||||||
|
val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication]))
|
||||||
|
spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication")
|
||||||
|
|
||||||
|
log.info("Converting software")
|
||||||
|
val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software]))
|
||||||
|
spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software")
|
||||||
|
|
||||||
|
log.info("Converting otherresearchproduct")
|
||||||
|
val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
|
||||||
|
spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct")
|
||||||
|
|
||||||
|
|
||||||
|
log.info("Converting Relation")
|
||||||
|
|
||||||
|
|
||||||
|
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation").map(s => mapper.readValue(s, classOf[Relation]))
|
||||||
|
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
|
||||||
|
|
||||||
|
|
||||||
entities.foreach{
|
|
||||||
e =>
|
|
||||||
val rdd =spark.sparkContext.textFile(s"$sourcePath/${e._1}").map(s => mapper.readValue(s, e._2))
|
|
||||||
spark.createDataset(rdd).as[Result].write.mode(SaveMode.Overwrite).save(s"$targetPath/${e._1}")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
package eu.dnetlib.dhp.sx.graph
|
package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||||
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
|
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
|
||||||
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
|
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
|
@ -29,11 +29,12 @@ object SparkCreateSummaryObject {
|
||||||
log.info(s"targetPath -> $targetPath")
|
log.info(s"targetPath -> $targetPath")
|
||||||
|
|
||||||
implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
|
implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
|
||||||
|
implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
|
||||||
implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||||
|
|
||||||
|
|
||||||
val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result]
|
val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Oaf].filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
|
||||||
|
|
||||||
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s!= null).write.mode(SaveMode.Overwrite).save(targetPath)
|
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s!= null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
|
|
||||||
|
|
|
@ -57,10 +57,10 @@ object SparkResolveRelation {
|
||||||
currentRelation
|
currentRelation
|
||||||
}.write
|
}.write
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.save(s"$workingPath/resolvedSource")
|
.save(s"$workingPath/relationResolvedSource")
|
||||||
|
|
||||||
|
|
||||||
val relationSourceResolved:Dataset[(String,Relation)] = spark.read.load(s"$workingPath/resolvedSource").as[Relation].map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
val relationSourceResolved:Dataset[(String,Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation].map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||||
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map{
|
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map{
|
||||||
m =>
|
m =>
|
||||||
val targetResolved = m._2
|
val targetResolved = m._2
|
||||||
|
|
|
@ -35,7 +35,7 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--master</arg><arg>yarn</arg>
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--targetPath</arg><arg>${targetPath}/entities</arg>
|
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="CreateSummaries"/>
|
<ok to="CreateSummaries"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -87,7 +87,7 @@
|
||||||
<arg>--master</arg><arg>yarn</arg>
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
<arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
|
<arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
|
||||||
<arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
|
<arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
|
||||||
<arg>--relationPath</arg><arg>${sourcePath}/relation_resolved</arg>
|
<arg>--relationPath</arg><arg>${targetPath}/relation</arg>
|
||||||
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="DropJSONPath"/>
|
<ok to="DropJSONPath"/>
|
||||||
|
|
Loading…
Reference in New Issue