forked from D-Net/dnet-hadoop
added first implementation of Pangaea Mapping
This commit is contained in:
parent
c25238480c
commit
7f8848ecdd
|
@ -73,7 +73,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
|
||||||
.load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
|
.load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
|
||||||
.as(Encoders.bean(Relation.class));
|
.as(Encoders.bean(Relation.class));
|
||||||
|
|
||||||
//<mergedObjectID, dedupID>
|
// <mergedObjectID, dedupID>
|
||||||
Dataset<Tuple2<String, String>> mergedIds = mergeRels
|
Dataset<Tuple2<String, String>> mergedIds = mergeRels
|
||||||
.where(col("relClass").equalTo(ModelConstants.MERGES))
|
.where(col("relClass").equalTo(ModelConstants.MERGES))
|
||||||
.select(col("source"), col("target"))
|
.select(col("source"), col("target"))
|
||||||
|
@ -116,31 +116,32 @@ public class SparkPropagateRelation extends AbstractSparkAction {
|
||||||
.map((MapFunction<Tuple2<String, Relation>, Relation>) t -> t._2(), Encoders.bean(Relation.class));
|
.map((MapFunction<Tuple2<String, Relation>, Relation>) t -> t._2(), Encoders.bean(Relation.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
//redirect the relations to the dedupID
|
// redirect the relations to the dedupID
|
||||||
private static Dataset<Relation> createNewRels(
|
private static Dataset<Relation> createNewRels(
|
||||||
Dataset<Relation> rels, //all the relations to be redirected
|
Dataset<Relation> rels, // all the relations to be redirected
|
||||||
Dataset<Tuple2<String, String>> mergedIds, //merge rels: <mergedObjectID, dedupID>
|
Dataset<Tuple2<String, String>> mergedIds, // merge rels: <mergedObjectID, dedupID>
|
||||||
MapFunction<Tuple2<Tuple2<Tuple3<String, Relation, String>, Tuple2<String, String>>, Tuple2<String, String>>, Relation> mapRel) {
|
MapFunction<Tuple2<Tuple2<Tuple3<String, Relation, String>, Tuple2<String, String>>, Tuple2<String, String>>, Relation> mapRel) {
|
||||||
|
|
||||||
//<sourceID, relation, targetID>
|
// <sourceID, relation, targetID>
|
||||||
Dataset<Tuple3<String, Relation, String>> mapped = rels
|
Dataset<Tuple3<String, Relation, String>> mapped = rels
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Relation, Tuple3<String, Relation, String>>) r -> new Tuple3<>(getId(r, FieldType.SOURCE),
|
(MapFunction<Relation, Tuple3<String, Relation, String>>) r -> new Tuple3<>(getId(r, FieldType.SOURCE),
|
||||||
r, getId(r, FieldType.TARGET)),
|
r, getId(r, FieldType.TARGET)),
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class), Encoders.STRING()));
|
Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class), Encoders.STRING()));
|
||||||
|
|
||||||
//< <sourceID, relation, target>, <sourceID, dedupID> >
|
// < <sourceID, relation, target>, <sourceID, dedupID> >
|
||||||
Dataset<Tuple2<Tuple3<String, Relation, String>, Tuple2<String, String>>> relSource = mapped
|
Dataset<Tuple2<Tuple3<String, Relation, String>, Tuple2<String, String>>> relSource = mapped
|
||||||
.joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer");
|
.joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer");
|
||||||
|
|
||||||
//< <<sourceID, relation, targetID>, <sourceID, dedupID>>, <targetID, dedupID> >
|
// < <<sourceID, relation, targetID>, <sourceID, dedupID>>, <targetID, dedupID> >
|
||||||
Dataset<Tuple2<Tuple2<Tuple3<String, Relation, String>, Tuple2<String, String>>, Tuple2<String, String>>> relSourceTarget = relSource
|
Dataset<Tuple2<Tuple2<Tuple3<String, Relation, String>, Tuple2<String, String>>, Tuple2<String, String>>> relSourceTarget = relSource
|
||||||
.joinWith(mergedIds, relSource.col("_1._3").equalTo(mergedIds.col("_1")), "left_outer");
|
.joinWith(mergedIds, relSource.col("_1._3").equalTo(mergedIds.col("_1")), "left_outer");
|
||||||
|
|
||||||
return relSourceTarget
|
return relSourceTarget
|
||||||
.filter(
|
.filter(
|
||||||
(FilterFunction<Tuple2<Tuple2<Tuple3<String, Relation, String>, Tuple2<String, String>>, Tuple2<String, String>>>)
|
(FilterFunction<Tuple2<Tuple2<Tuple3<String, Relation, String>, Tuple2<String, String>>, Tuple2<String, String>>>) r -> r
|
||||||
r -> r._1()._1() != null || r._2() != null)
|
._1()
|
||||||
|
._1() != null || r._2() != null)
|
||||||
.map(mapRel, Encoders.bean(Relation.class))
|
.map(mapRel, Encoders.bean(Relation.class))
|
||||||
.distinct();
|
.distinct();
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
package eu.dnetlib.sx.pangaea
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.spark.sql.expressions.Aggregator
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders}
|
||||||
|
import org.json4s
|
||||||
|
import org.json4s.DefaultFormats
|
||||||
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
|
||||||
|
import java.text.SimpleDateFormat
|
||||||
|
import java.util.Date
|
||||||
|
|
||||||
|
|
||||||
|
case class PangaeaDataModel(datestamp:String, identifier:String, xml:String) {}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
object PangaeaUtils {
|
||||||
|
|
||||||
|
|
||||||
|
def toDataset(input:String):PangaeaDataModel = {
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
|
||||||
|
val d = new Date()
|
||||||
|
val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format(d)}Z"
|
||||||
|
|
||||||
|
val ds = (json \ "internal-datestamp").extractOrElse[String](s)
|
||||||
|
val identifier= (json \ "metadatalink").extractOrElse[String]()
|
||||||
|
val xml= (json \ "xml").extract[String]
|
||||||
|
PangaeaDataModel(ds, identifier,xml)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] = new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel]{
|
||||||
|
|
||||||
|
|
||||||
|
override def zero: PangaeaDataModel = null
|
||||||
|
|
||||||
|
override def reduce(b: PangaeaDataModel, a: (String, PangaeaDataModel)): PangaeaDataModel = {
|
||||||
|
if (b == null)
|
||||||
|
a._2
|
||||||
|
else {
|
||||||
|
if (a == null)
|
||||||
|
b
|
||||||
|
else {
|
||||||
|
val ts1 = b.datestamp
|
||||||
|
val ts2 = a._2.datestamp
|
||||||
|
if (ts1 > ts2)
|
||||||
|
b
|
||||||
|
else
|
||||||
|
a._2
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
override def merge(b1: PangaeaDataModel, b2: PangaeaDataModel): PangaeaDataModel = {
|
||||||
|
if (b1 == null)
|
||||||
|
b2
|
||||||
|
else {
|
||||||
|
if (b2 == null)
|
||||||
|
b1
|
||||||
|
else {
|
||||||
|
val ts1 = b1.datestamp
|
||||||
|
val ts2 = b2.datestamp
|
||||||
|
if (ts1 > ts2)
|
||||||
|
b1
|
||||||
|
else
|
||||||
|
b2
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
override def finish(reduction: PangaeaDataModel): PangaeaDataModel = reduction
|
||||||
|
|
||||||
|
override def bufferEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||||
|
|
||||||
|
override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,53 @@
|
||||||
|
package eu.dnetlib.sx.pangaea
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame
|
||||||
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.apache.spark.{SparkConf, SparkContext}
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
object SparkGeneratePanagaeaDataset {
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
val conf: SparkConf = new SparkConf()
|
||||||
|
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")).mkString)
|
||||||
|
parser.parseArgument(args)
|
||||||
|
|
||||||
|
|
||||||
|
val spark: SparkSession =
|
||||||
|
SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
|
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
|
||||||
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
|
||||||
|
logger.info("Converting sequential file into Dataset")
|
||||||
|
val sc:SparkContext = spark.sparkContext
|
||||||
|
|
||||||
|
val workingPath:String = parser.get("workingPath")
|
||||||
|
|
||||||
|
implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||||
|
|
||||||
|
val inputRDD:RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
|
||||||
|
|
||||||
|
spark.createDataset(inputRDD).as[PangaeaDataModel]
|
||||||
|
.map(s => (s.identifier,s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
|
||||||
|
.groupByKey(_._1)(Encoders.STRING)
|
||||||
|
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
|
||||||
|
.map(s => s._2)
|
||||||
|
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset_updated")
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
</configuration>
|
|
@ -0,0 +1,40 @@
|
||||||
|
<workflow-app name="Transform_Pangaea_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>pangaeaWorkingPath</name>
|
||||||
|
<description>the Pangaea Working Path</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="ConvertDataset"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ConvertDataset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Convert Pangaea to Dataset</name>
|
||||||
|
<class>eu.dnetlib.sx.pangaea.SparkGeneratePanagaeaDataset</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--workingPath</arg><arg>${pangaeaWorkingPath}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,4 @@
|
||||||
|
[
|
||||||
|
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||||
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}
|
||||||
|
]
|
|
@ -0,0 +1,25 @@
|
||||||
|
package eu.dnetlib.dhp.sx.pangaea
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test
|
||||||
|
import java.util.TimeZone
|
||||||
|
import java.text.SimpleDateFormat
|
||||||
|
import java.util.Date
|
||||||
|
class PangaeaTransformTest {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def test_dateStamp() :Unit ={
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
val d = new Date()
|
||||||
|
|
||||||
|
val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format(d)}Z"
|
||||||
|
|
||||||
|
|
||||||
|
println(s)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue