Merge pull request 'implemented oozie workflow to generate scholix dump filtering relclass semantic' (#229) from opencitation_enrichments into beta

Reviewed-on: #229
2022-07-21 10:12:17 +02:00 · 2022-07-21 10:12:17 +02:00 · d900a02b74
parent 18b505d6a3 5f651f2316
commit d900a02b74
6 changed files with 226 additions and 26 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json
@ -2,4 +2,5 @@
  {"paramName":"mt",  "paramLongName":"master",     "paramDescription": "should be local or yarn",  "paramRequired": true},
  {"paramName":"s",   "paramLongName":"sourcePath", "paramDescription": "the source Path",           "paramRequired": true},
  {"paramName":"t",   "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true}
+  {"paramName":"r",   "paramLongName":"filterRelation", "paramDescription": "the relation to filter", "paramRequired": false}
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json
@ -3,5 +3,7 @@
  {"paramName":"s",   "paramLongName":"sourcePath", "paramDescription": "the source Path",                        "paramRequired": true},
  {"paramName":"su",   "paramLongName":"scholixUpdatePath", "paramDescription": "the scholix updated Path",       "paramRequired": false},
  {"paramName":"t",   "paramLongName":"targetPath", "paramDescription": "the path of the raw graph",              "paramRequired": true},
-  {"paramName":"o",   "paramLongName":"objectType", "paramDescription": "should be scholix or Summary",           "paramRequired": true}
+  {"paramName":"o",   "paramLongName":"objectType", "paramDescription": "should be scholix or Summary",           "paramRequired": true},
+  {"paramName":"mp",   "paramLongName":"maxPidNumberFilter", "paramDescription": "filter max number of pids in source/target", "paramRequired": false}
+
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/config-default.xml
@ -0,0 +1,10 @@
+<configuration>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/workflow.xml
@ -0,0 +1,145 @@
+<workflow-app name="Create Scholix Dump" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>targetPath</name>
+            <description>the final graph path</description>
+        </property>
+        <property>
+            <name>relationFilter</name>
+            <description>Filter relation semantic</description>
+        </property>
+        <property>
+            <name>maxNumberOfPid</name>
+            <description>filter relation with at least #maxNumberOfPid</description>
+        </property>
+
+    </parameters>
+
+    <start to="ImportDatasetEntities"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ImportDatasetEntities">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Import JSONRDD to Dataset kryo</name>
+            <class>eu.dnetlib.dhp.sx.graph.SparkConvertRDDtoDataset</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.shuffle.partitions=3000
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--master</arg><arg>yarn</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--filterRelation</arg><arg>${relationFilter}</arg>
+        </spark>
+        <ok to="CreateSummaries"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="CreateSummaries">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Convert Entities to summaries</name>
+            <class>eu.dnetlib.dhp.sx.graph.SparkCreateSummaryObject</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.shuffle.partitions=20000
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--master</arg><arg>yarn</arg>
+            <arg>--sourcePath</arg><arg>${targetPath}/entities</arg>
+            <arg>--targetPath</arg><arg>${targetPath}/provision/summaries</arg>
+        </spark>
+        <ok to="CreateScholix"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CreateScholix">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Generate Scholix Dataset</name>
+            <class>eu.dnetlib.dhp.sx.graph.SparkCreateScholix</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.shuffle.partitions=30000
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--master</arg><arg>yarn</arg>
+            <arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
+            <arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
+            <arg>--relationPath</arg><arg>${targetPath}/relation</arg>
+        </spark>
+        <ok to="DropJSONPath"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="DropJSONPath">
+        <fs>
+            <delete path='${targetPath}/json'/>
+            <mkdir path='${targetPath}/json/'/>
+        </fs>
+        <ok to="SerializeScholix"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="SerializeScholix">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Serialize scholix to JSON</name>
+            <class>eu.dnetlib.dhp.sx.graph.SparkConvertObjectToJson</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.shuffle.partitions=6000
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--master</arg><arg>yarn</arg>
+            <arg>--sourcePath</arg><arg>${targetPath}/provision/scholix/scholix</arg>
+            <arg>--targetPath</arg><arg>${targetPath}/json/scholix_json</arg>
+            <arg>--objectType</arg><arg>scholix</arg>
+            <arg>--maxPidNumberFilter</arg><arg>maxNumberOfPid</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala
@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.sx.scholix.Scholix
 import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
+import eu.dnetlib.dhp.sx.graph.SparkConvertObjectToJson.toInt
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.SparkConf
@ -12,6 +13,14 @@ import org.slf4j.{Logger, LoggerFactory}

 object SparkConvertObjectToJson {

+  def toInt(s: String): Option[Int] = {
+    try {
+      Some(s.toInt)
+    } catch {
+      case e: Exception => None
+    }
+  }
+
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
@ -37,6 +46,8 @@ object SparkConvertObjectToJson {
    log.info(s"objectType  -> $objectType")
    val scholixUpdatePath = parser.get("scholixUpdatePath")
    log.info(s"scholixUpdatePath  -> $scholixUpdatePath")
+    val maxPidNumberFilter = parser.get("maxPidNumberFilter")
+    log.info(s"maxPidNumberFilter  -> $maxPidNumberFilter")

    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
@ -47,12 +58,22 @@ object SparkConvertObjectToJson {
      case "scholix" =>
        log.info("Serialize Scholix")
        val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
-        val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
-        d.union(u)
-          .repartition(8000)
-          .map(s => mapper.writeValueAsString(s))(Encoders.STRING)
-          .rdd
-          .saveAsTextFile(targetPath, classOf[GzipCodec])
+//        val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
+        if (maxPidNumberFilter != null && toInt(maxPidNumberFilter).isDefined) {
+          val mp = toInt(maxPidNumberFilter).get
+          d
+            .filter(s => (s.getSource.getIdentifier.size() <= mp) && (s.getTarget.getIdentifier.size() <= mp))
+            .map(s => mapper.writeValueAsString(s))(Encoders.STRING)
+            .rdd
+            .saveAsTextFile(targetPath, classOf[GzipCodec])
+        } else {
+          d
+            .repartition(8000)
+            .map(s => mapper.writeValueAsString(s))(Encoders.STRING)
+            .rdd
+            .saveAsTextFile(targetPath, classOf[GzipCodec])
+        }
+
      case "summary" =>
        log.info("Serialize Summary")
        val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
@ -4,9 +4,11 @@ import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
 import org.apache.commons.io.IOUtils
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
+
 import scala.collection.JavaConverters._

 object SparkConvertRDDtoDataset {
@ -34,6 +36,9 @@ object SparkConvertRDDtoDataset {
    val t = parser.get("targetPath")
    log.info(s"targetPath  -> $t")

+    val filterRelation = parser.get("filterRelation")
+    log.info(s"filterRelation  -> $filterRelation")
+
    val entityPath = s"$t/entities"
    val relPath = s"$t/relation"
    val mapper = new ObjectMapper()
@ -94,28 +99,44 @@ object SparkConvertRDDtoDataset {

    log.info("Converting Relation")

-    val relationSemanticFilter = List(
-//      "cites",
-//      "iscitedby",
-      "merges",
-      "ismergedin",
-      "HasAmongTopNSimilarDocuments",
-      "IsAmongTopNSimilarDocuments"
-    )
+    if (filterRelation != null && StringUtils.isNoneBlank(filterRelation)) {

-    val rddRelation = spark.sparkContext
-      .textFile(s"$sourcePath/relation")
-      .map(s => mapper.readValue(s, classOf[Relation]))
-      .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
-      .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
-      //filter OpenCitations relations
-      .filter(r =>
-        r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
-          "opencitations".equalsIgnoreCase(k.getValue)
+      val rddRelation = spark.sparkContext
+        .textFile(s"$sourcePath/relation")
+        .map(s => mapper.readValue(s, classOf[Relation]))
+        .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
+        .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
+        //filter OpenCitations relations
+        .filter(r =>
+          r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
+            "opencitations".equalsIgnoreCase(k.getValue)
+          )
        )
+        .filter(r => r.getSubRelType != null && r.getSubRelType.equalsIgnoreCase(filterRelation))
+      spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
+    } else {
+
+      val relationSemanticFilter = List(
+        "merges",
+        "ismergedin",
+        "HasAmongTopNSimilarDocuments",
+        "IsAmongTopNSimilarDocuments"
      )
-      .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
-    spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
+
+      val rddRelation = spark.sparkContext
+        .textFile(s"$sourcePath/relation")
+        .map(s => mapper.readValue(s, classOf[Relation]))
+        .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
+        .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
+        //filter OpenCitations relations
+        .filter(r =>
+          r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
+            "opencitations".equalsIgnoreCase(k.getValue)
+          )
+        )
+        .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
+      spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
+    }

  }
 }