forked from D-Net/dnet-hadoop

dataset based provision WIP, fixed spark2EventLogDir

commit daa26acc9d
parent 9c7092416a
@@ -86,7 +86,6 @@ public class GraphJoiner_v2 implements Serializable {
         Dataset<TypedRow> publication = readPathEntity(jsc, getInputPath(), "publication");
 
         // create the union between all the entities
-        Dataset<Tuple2<String, TypedRow>> entities =
         datasource
                 .union(organization)
                 .union(project)
@@ -94,16 +93,29 @@ public class GraphJoiner_v2 implements Serializable {
                 .union(otherresearchproduct)
                 .union(software)
                 .union(publication)
-                .map((MapFunction<TypedRow, Tuple2<String, TypedRow>>) value -> new Tuple2<>(
-                        value.getId(),
-                        value),
-                        Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class)))
-                .cache();
+                .repartition(20000)
+                .write()
+                .parquet(getOutPath() + "/entities");
 
+        Dataset<Tuple2<String, TypedRow>> entities = getSpark()
+                .read()
+                .load(getOutPath() + "/entities")
+                .map((MapFunction<Row, Tuple2<String, TypedRow>>) r -> {
+                    TypedRow t = new TypedRow();
+                    t.setId(r.getAs("id"));
+                    t.setDeleted(r.getAs("deleted"));
+                    t.setType(r.getAs("type"));
+                    t.setOaf(r.getAs("oaf"));
+
+                    return new Tuple2<>(t.getId(), t);
+                }, Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class)));
+
+        System.out.println("Entities, number of partitions: " + entities.rdd().getNumPartitions());
         System.out.println("Entities schema:");
         entities.printSchema();
-        // reads the relationships
 
+        /*
+        // reads the relationships
         Dataset<Relation> rels = readPathRelation(jsc, getInputPath())
                 .groupByKey((MapFunction<Relation, SortableRelationKey>) t -> SortableRelationKey.from(t), Encoders.kryo(SortableRelationKey.class))
                 .flatMapGroups((FlatMapGroupsFunction<SortableRelationKey, Relation, Relation>) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.bean(Relation.class))
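The hunk above replaces the in-memory `.cache()` of the entity union with an explicit checkpoint: the union is repartitioned, written to parquet, and read back as a fresh Dataset, cutting the lineage so the downstream joins restart from disk instead of recomputing the union. Below is a minimal, self-contained sketch of that pattern. It assumes TypedRow is a plain bean with exactly the four fields the r.getAs(...) calls name, that the incoming Dataset is bean-encoded (so parquet gets one column per field), and it uses an illustrative partition count; the class and method names here are hypothetical.

    import java.io.Serializable;

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class ParquetCheckpointSketch {

        // Simplified stand-in for the project's TypedRow bean; only the four
        // fields read back via r.getAs(...) in the commit are modelled here.
        public static class TypedRow implements Serializable {
            private String id;
            private Boolean deleted;
            private String type;
            private String oaf;
            public String getId() { return id; }
            public void setId(String id) { this.id = id; }
            public Boolean getDeleted() { return deleted; }
            public void setDeleted(Boolean deleted) { this.deleted = deleted; }
            public String getType() { return type; }
            public void setType(String type) { this.type = type; }
            public String getOaf() { return oaf; }
            public void setOaf(String oaf) { this.oaf = oaf; }
        }

        // Assumes `union` is bean-encoded so the parquet files carry named
        // columns; a kryo-encoded input would serialize to one binary column.
        public static Dataset<TypedRow> checkpointToParquet(
                SparkSession spark, Dataset<TypedRow> union, String outPath) {
            // Materialize the expensive union exactly once; the repartition
            // fixes the file count (the commit uses 20000 on the full graph).
            union.repartition(200)
                    .write()
                    .parquet(outPath + "/entities");

            // Read it back: the returned Dataset has no lineage to the union,
            // so later stages recompute from parquet, not from the source reads.
            return spark.read()
                    .parquet(outPath + "/entities")
                    .map((MapFunction<Row, TypedRow>) r -> {
                        TypedRow t = new TypedRow();
                        t.setId(r.getAs("id"));
                        t.setDeleted(r.getAs("deleted"));
                        t.setType(r.getAs("type"));
                        t.setOaf(r.getAs("oaf"));
                        return t;
                    }, Encoders.kryo(TypedRow.class));
        }
    }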
@@ -126,7 +138,7 @@ public class GraphJoiner_v2 implements Serializable {
                     e.setRelation(t._1()._2());
                     e.setTarget(asRelatedEntity(t._2()._2()));
                     return e;
-                }, Encoders.bean(EntityRelEntity.class))
+                }, Encoders.kryo(EntityRelEntity.class))
                 .map((MapFunction<EntityRelEntity, Tuple2<String, EntityRelEntity>>) e -> new Tuple2<>(e.getRelation().getSource(), e),
                         Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class)));
 
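The only change in this hunk swaps the intermediate EntityRelEntity encoder from Encoders.bean to Encoders.kryo. The trade-off: a bean encoder maps each getter/setter property to its own typed column (schema-aware, prunable, writable as parquet), while a kryo encoder serializes the whole object into a single binary column named "value", which tolerates object graphs the bean encoder cannot express. A small illustration of the schema difference, using a throwaway Point bean (not from this repo):

    import java.io.Serializable;
    import java.util.Collections;

    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;

    public class EncoderSchemaDemo {

        public static class Point implements Serializable {
            private int x;
            private int y;
            public int getX() { return x; }
            public void setX(int x) { this.x = x; }
            public int getY() { return y; }
            public void setY(int y) { this.y = y; }
        }

        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .master("local[*]").appName("encoder-demo").getOrCreate();

            // bean encoder: one typed column per property
            spark.createDataset(Collections.<Point>emptyList(), Encoders.bean(Point.class))
                    .printSchema();   // |-- x: integer, |-- y: integer

            // kryo encoder: the whole object as one opaque binary column
            spark.createDataset(Collections.<Point>emptyList(), Encoders.kryo(Point.class))
                    .printSchema();   // |-- value: binary

            spark.stop();
        }
    }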
@@ -160,13 +172,11 @@ public class GraphJoiner_v2 implements Serializable {
         final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId);
         grouped
                 .map((MapFunction<JoinedEntity, String>) value -> recordFactory.build(value), Encoders.STRING())
-                .write()
-                .text(getOutPath() + "/xml");
-        /*
                 .javaRDD()
                 .mapToPair((PairFunction<Tuple2<String, String>, String, String>) t -> new Tuple2<>(t._1(), t._2()))
                 .saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
 
+
        */
 
         return this;
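This hunk drops the Dataset write (.write().text(...)) and re-attaches the previously commented RDD path: .javaRDD() drops to the RDD API so the records can be stored as a gzip-compressed Hadoop SequenceFile keyed by record id, which .text() cannot do (it writes values only). Note that the /* opened in the earlier hunk still covers this whole block, so in this WIP state the join/XML stage is effectively disabled; the retained code only documents the intended output format. A runnable sketch of that SequenceFile write, with placeholder path and data:

    import java.util.Arrays;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.compress.GzipCodec;
    import org.apache.hadoop.mapred.SequenceFileOutputFormat;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.PairFunction;
    import scala.Tuple2;

    public class SequenceFileOutputSketch {
        public static void main(String[] args) {
            JavaSparkContext sc = new JavaSparkContext("local[*]", "seqfile-sketch");

            // Placeholder (id, xml) records standing in for the joined entities.
            JavaPairRDD<Text, Text> records = sc
                    .parallelize(Arrays.asList(new Tuple2<>("id-1", "<record/>")))
                    .mapToPair((PairFunction<Tuple2<String, String>, Text, Text>) t ->
                            new Tuple2<>(new Text(t._1()), new Text(t._2())));

            // Same output call as the commented block in the commit: a Hadoop
            // SequenceFile (old mapred API) with gzip-compressed Text key/values.
            records.saveAsHadoopFile("/tmp/xml", Text.class, Text.class,
                    SequenceFileOutputFormat.class, GzipCodec.class);

            sc.stop();
        }
    }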
@@ -25,6 +25,6 @@
     </property>
     <property>
         <name>spark2EventLogDir</name>
-        <value>/user/spark/applicationHistory</value>
+        <value>/user/spark/spark2ApplicationHistory</value>
     </property>
 </configuration>
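The config fix matters on clusters that run Spark 1 and Spark 2 side by side (e.g. CDH): each history server reads its own event-log directory, so Spark 2 jobs must log under spark2ApplicationHistory or they never appear in the Spark 2 history server. The actual action definitions are not part of this diff, so the snippet below is only an illustration of how such a property is typically wired into a Spark 2 Oozie action; the ${nameNode} prefix is an assumption.

    <!-- illustrative only: typical wiring of spark2EventLogDir into a Spark 2
         Oozie action; this project's actual <spark-opts> are not in the diff -->
    <spark-opts>--conf spark.eventLog.enabled=true --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>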