master set to 'yarn' in spark actions, removed path to rawSet from the dedup scan workflow

2020-03-25 14:16:06 +01:00 · 2020-03-25 14:16:06 +01:00 · 36f8f2ea66
parent efb0b7d660
commit 36f8f2ea66
4 changed files with 10 additions and 33 deletions
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java
@ -43,22 +43,17 @@ public class SparkCreateSimRels implements Serializable {
        //read oozie parameters
        final String graphBasePath = parser.get("graphBasePath");
        final String isLookUpUrl = parser.get("isLookUpUrl");
-        final String rawSet = parser.get("rawSet");
        final String actionSetId = parser.get("actionSetId");
        final String workingPath = parser.get("workingPath");

        System.out.println(String.format("graphBasePath: '%s'", graphBasePath));
        System.out.println(String.format("isLookUpUrl: '%s'", isLookUpUrl));
-        System.out.println(String.format("rawSet: '%s'", rawSet));
        System.out.println(String.format("actionSetId: '%s'", actionSetId));
        System.out.println(String.format("workingPath: '%s'", workingPath));

        try (SparkSession spark = getSparkSession(parser)) {
            final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

-            //create empty sequenceFile for the accumulation
-            JavaRDD<Tuple2<Text,Text>> simRel = sc.emptyRDD();
-
            //for each dedup configuration
            for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
                final String entity = dedupConf.getWf().getEntityType();
@ -83,23 +78,16 @@ public class SparkCreateSimRels implements Serializable {
                        .write()
                        .mode("overwrite")
                        .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
-
-                if (rawSet != null) {
-                    //create atomic actions
-                    JavaRDD<Tuple2<Text, Text>> newSimRels = relationsRDD
-                            .map(this::createSequenceFileRow);
-
-                    simRel = simRel.union(newSimRels);
-                }
            }
-
-            if (rawSet != null)
-                simRel.mapToPair(r -> r)
-                    .saveAsHadoopFile(rawSet, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
        }
-
    }

+    /**
+     * Utility method used to create an atomic action from a Relation object
+     * @param relation input relation
+     * @return A tuple2 with [id, json serialization of the atomic action]
+     * @throws JsonProcessingException if the atomic action cannot be serialized to JSON
+     */
    public Tuple2<Text, Text> createSequenceFileRow(Relation relation) throws JsonProcessingException {

        ObjectMapper mapper = new ObjectMapper();
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml
@ -65,7 +65,7 @@
                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                --conf spark.sql.warehouse.dir="/user/hive/warehouse"
            </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>-mt</arg><arg>yarn</arg>
            <arg>--i</arg><arg>${graphBasePath}</arg>
            <arg>--w</arg><arg>${workingPath}</arg>
            <arg>--o</arg><arg>${dedupGraphPath}</arg>
@ -92,7 +92,7 @@
                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                --conf spark.sql.warehouse.dir="/user/hive/warehouse"
            </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>-mt</arg><arg>yarn</arg>
            <arg>--i</arg><arg>${graphBasePath}</arg>
            <arg>--o</arg><arg>${dedupGraphPath}</arg>
            <arg>--w</arg><arg>${workingPath}</arg>
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json
@ -23,12 +23,6 @@
    "paramDescription": "the base path of the raw graph",
    "paramRequired": true
  },
-  {
-    "paramName": "o",
-    "paramLongName": "rawSet",
-    "paramDescription": "the raw set to be saved (full path)",
-    "paramRequired": false
-  },
  {
    "paramName": "w",
    "paramLongName": "workingPath",
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
@ -4,10 +4,6 @@
            <name>graphBasePath</name>
            <description>the raw graph base path</description>
        </property>
-        <property>
-            <name>rawSet</name>
-            <description>the output directory in the targetPath</description>
-        </property>
        <property>
            <name>isLookUpUrl</name>
            <description>the address of the lookUp service</description>
@ -58,7 +54,6 @@
    <action name="CreateSimRel">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <prepare>
-                <delete path="${rawSet}"/>
                <delete path="${workingPath}/${actionSetId}/*_simrel"/>
            </prepare>
            <master>yarn</master>
@ -101,7 +96,7 @@
                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                --conf spark.sql.warehouse.dir="/user/hive/warehouse"
            </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>-mt</arg><arg>yarn</arg>
            <arg>--i</arg><arg>${graphBasePath}</arg>
            <arg>--w</arg><arg>${workingPath}</arg>
            <arg>--la</arg><arg>${isLookUpUrl}</arg>
@ -129,7 +124,7 @@
                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                --conf spark.sql.warehouse.dir="/user/hive/warehouse"
            </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>-mt</arg><arg>yarn</arg>
            <arg>--i</arg><arg>${graphBasePath}</arg>
            <arg>--w</arg><arg>${workingPath}</arg>
            <arg>--la</arg><arg>${isLookUpUrl}</arg>