dataset based provision WIP

2020-04-01 19:07:30 +02:00 · 2020-04-01 19:07:30 +02:00 · 9c7092416a
parent 1402eb1fe7
commit 9c7092416a
5 changed files with 32 additions and 40 deletions
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java
@ -45,19 +45,12 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.asRelatedEntit
 */
 public class GraphJoiner_v2 implements Serializable {

-    public static final int LIMIT = 1000000;
    private Map<String, LongAccumulator> accumulators = Maps.newHashMap();

    public static final int MAX_RELS = 100;

    public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";

-    private static final StructType KV_SCHEMA = StructType$.MODULE$.apply(
-            Arrays.asList(
-                    StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()),
-                    StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())
-            ));
-
    private SparkSession spark;

    private ContextMapper contextMapper;
@ -105,7 +98,6 @@ public class GraphJoiner_v2 implements Serializable {
                        value.getId(),
                        value),
                        Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class)))
-                .limit(LIMIT)
                .cache();

        System.out.println("Entities schema:");
@ -115,7 +107,6 @@ public class GraphJoiner_v2 implements Serializable {
        Dataset<Relation> rels = readPathRelation(jsc, getInputPath())
                .groupByKey((MapFunction<Relation, SortableRelationKey>) t -> SortableRelationKey.from(t), Encoders.kryo(SortableRelationKey.class))
                .flatMapGroups((FlatMapGroupsFunction<SortableRelationKey, Relation, Relation>) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.bean(Relation.class))
-                .limit(LIMIT)
                .cache();

        System.out.println("Relation schema:");
@ -169,7 +160,6 @@ public class GraphJoiner_v2 implements Serializable {
        final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId);
        grouped
                .map((MapFunction<JoinedEntity, String>) value -> recordFactory.build(value), Encoders.STRING())
-                .limit(LIMIT)
                .write()
                .text(getOutPath() + "/xml");
        /*
@ -245,13 +235,11 @@ public class GraphJoiner_v2 implements Serializable {
     * @return the Dataset<TypedRow> of entities parsed from the text files found under inputPath/type
     */
    private Dataset<TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) {
-        RDD<Row> rdd = sc.textFile(inputPath + "/" + type)
-                .map((Function<String, Row>) s -> RowFactory.create("", s))
+        RDD<String> rdd = sc.textFile(inputPath + "/" + type)
                .rdd();

-        return getSpark().createDataFrame(rdd, KV_SCHEMA)
-                .map((MapFunction<Row, TypedRow>) row -> {
-                    final String s = row.getAs("value");
+        return getSpark().createDataset(rdd, Encoders.STRING())
+                .map((MapFunction<String, TypedRow>) s -> {
                    final DocumentContext json = JsonPath.parse(s);
                    final TypedRow t = new TypedRow();
                    t.setId(json.read("$.id"));
@ -270,12 +258,11 @@ public class GraphJoiner_v2 implements Serializable {
     * @return the Dataset<Relation> containing all the relationships
     */
    private Dataset<Relation> readPathRelation(final JavaSparkContext sc, final String inputPath) {
-        final RDD<Row> rdd = sc.textFile(inputPath + "/relation")
-                .map((Function<String, Row>) s -> RowFactory.create("", s))
+        final RDD<String> rdd = sc.textFile(inputPath + "/relation")
                .rdd();

-        return getSpark().createDataFrame(rdd, KV_SCHEMA)
-                .map((MapFunction<Row, Relation>) value -> new ObjectMapper().readValue(value.<String>getAs("value"), Relation.class), Encoders.bean(Relation.class));
+        return getSpark().createDataset(rdd, Encoders.STRING())
+                .map((MapFunction<String, Relation>) s -> new ObjectMapper().readValue(s, Relation.class), Encoders.bean(Relation.class));
    }

    private ObjectMapper getObjectMapper() {
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java
@ -17,23 +17,23 @@ public class SparkXmlRecordBuilderJob_v2 {
                        SparkXmlRecordBuilderJob_v2.class.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json")));
        parser.parseArgument(args);

-        final String master = parser.get("master");
-        try(SparkSession spark = getSession(master)) {
+        try(SparkSession spark = getSession(parser)) {

            final String inputPath = parser.get("sourcePath");
            final String outputPath = parser.get("outputPath");
            final String isLookupUrl = parser.get("isLookupUrl");
            final String otherDsTypeId = parser.get("otherDsTypeId");

+
            new GraphJoiner_v2(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath)
                    .adjacencyLists();
        }
    }

-    private static SparkSession getSession(String master) {
+    private static SparkSession getSession(ArgumentApplicationParser parser) {
        final SparkConf conf = new SparkConf();
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
-        conf.set("spark.sql.shuffle.partitions", "500");
+        conf.set("spark.sql.shuffle.partitions", parser.get("sparkSqlShufflePartitions"));
        conf.registerKryoClasses(new Class[]{
                Author.class,
                Context.class,
@ -74,7 +74,7 @@ public class SparkXmlRecordBuilderJob_v2 {
                .builder()
                .config(conf)
                .appName(SparkXmlRecordBuilderJob_v2.class.getSimpleName())
-                .master(master)
+                .master(parser.get("master"))
                .getOrCreate();
    }

--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json
@ -3,5 +3,6 @@
  {"paramName":"is",  "paramLongName":"isLookupUrl",     "paramDescription": "URL of the isLookUp Service",                                 "paramRequired": true},
  {"paramName":"o",   "paramLongName":"outputPath",      "paramDescription": "the path used to store temporary output files",               "paramRequired": true},
  {"paramName":"s",   "paramLongName":"sourcePath",      "paramDescription": "the path of the sequence file to read",                       "paramRequired": true},
-  {"paramName":"t",   "paramLongName":"otherDsTypeId",   "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true}
+  {"paramName":"t",   "paramLongName":"otherDsTypeId",   "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true},
+  {"paramName":"sp",  "paramLongName":"sparkSqlShufflePartitions",   "paramDescription": "Configures the number of partitions to use when shuffling data for joins or aggregations", "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml
@ -19,13 +19,9 @@
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
-    <property>
-        <name>hive_db_name</name>
-        <value>openaire</value>
-    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
-        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@ -2,19 +2,27 @@

    <parameters>
        <property>
-            <name>hive_db_name</name>
-            <description>the target hive database name</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
+            <name>sparkDriverMemoryForJoining</name>
            <description>memory for driver process</description>
        </property>
        <property>
-            <name>sparkExecutorMemory</name>
+            <name>sparkExecutorMemoryForJoining</name>
            <description>memory for individual executor</description>
        </property>
        <property>
-            <name>sparkExecutorCores</name>
+            <name>sparkExecutorCoresForJoining</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>sparkDriverMemoryForIndexing</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemoryForIndexing</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCoresForIndexing</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
@ -75,13 +83,13 @@
                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.network.timeout=10000000
            </spark-opts>
            <arg>-mt</arg> <arg>yarn</arg>
            <arg>-is</arg> <arg>${isLookupUrl}</arg>
            <arg>-t</arg> <arg>${otherDsTypeId}</arg>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>-s</arg><arg>${sourcePath}</arg>
+            <arg>-o</arg><arg>${outputPath}</arg>
+            <arg>-sp</arg><arg>${sparkSqlShufflePartitions}</arg>
        </spark>
        <ok to="to_solr_index"/>
        <error to="Kill"/>