WIP: trying to find a way to build the records for the index

2020-01-16 12:02:28 +02:00 · 2020-01-16 12:02:28 +02:00 · 97c239ee0d
parent 7ba586d2e5
commit 97c239ee0d
9 changed files with 265 additions and 73 deletions
--- a/dhp-schemas/pom.xml
+++ b/dhp-schemas/pom.xml
@ -26,6 +26,11 @@
            <artifactId>commons-lang3</artifactId>
        </dependency>

+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+        </dependency>
+
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java
@ -1,5 +1,8 @@
 package eu.dnetlib.dhp.schema.oaf;

+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
 import java.io.Serializable;

 public abstract class Oaf implements Serializable {
@ -23,4 +26,14 @@ public abstract class Oaf implements Serializable {
    public void setLastupdatetimestamp(Long lastupdatetimestamp) {
        this.lastupdatetimestamp = lastupdatetimestamp;
    }
+
+    @Override
+    public String toString() {
+        try {
+            return new ObjectMapper().writeValueAsString(this);
+        } catch (JsonProcessingException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
 }
--- a/dhp-workflows/dhp-graph-provision/job-override.properties
+++ b/dhp-workflows/dhp-graph-provision/job-override.properties
@ -1,3 +1,5 @@
-sparkDriverMemory=16G
-sparkExecutorMemory=16G
+sparkDriverMemory=7G
+sparkExecutorMemory=7G
+sparkExecutorMemoryOverhead=5G
 hive_db_name=claudio
+sourcePath=/tmp/db_openaireplus_services_beta.export.2019.11.06
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java
@ -0,0 +1,53 @@
+package eu.dnetlib.dhp.graph;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+import java.io.Serializable;
+
+public class EntityRelEntity implements Serializable {
+    private TypedRow source;
+    private Relation relation;
+    private TypedRow target;
+
+    public EntityRelEntity(TypedRow source) {
+        this.source = source;
+    }
+
+    public TypedRow getSource() {
+        return source;
+    }
+
+    public EntityRelEntity setSource(TypedRow source) {
+        this.source = source;
+        return this;
+    }
+
+    public Relation getRelation() {
+        return relation;
+    }
+
+    public EntityRelEntity setRelation(Relation relation) {
+        this.relation = relation;
+        return this;
+    }
+
+    public TypedRow getTarget() {
+        return target;
+    }
+
+    public EntityRelEntity setTarget(TypedRow target) {
+        this.target = target;
+        return this;
+    }
+
+    @Override
+    public String toString() {
+        try {
+            return new ObjectMapper().writeValueAsString(this);
+        } catch (JsonProcessingException e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java
@ -0,0 +1,139 @@
+package eu.dnetlib.dhp.graph;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.oaf.*;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+import java.io.Serializable;
+
+public class GraphJoiner implements Serializable {
+
+    public static final int MAX_RELS = 100;
+
+    public void join(final SparkSession spark, final String inputPath, final String hiveDbName, final String outPath) {
+
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+        /*
+        JavaPairRDD<String, TypedRow> entities = sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
+                .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
+                .map(oaf -> new TypedRow("publication", oaf))
+                .mapToPair(toPair());
+
+         */
+
+        JavaPairRDD<String, TypedRow> entities = sc.sequenceFile(inputPath + "/datasource", Text.class, Text.class)
+                .map(item -> new ObjectMapper().readValue(item._2().toString(), Datasource.class))
+                .map(oaf -> new TypedRow("datasource", oaf))
+                .mapToPair(toPair())
+            .union(sc.sequenceFile(inputPath + "/organization", Text.class, Text.class)
+                    .map(item -> new ObjectMapper().readValue(item._2().toString(), Organization.class))
+                    .map(oaf -> new TypedRow("organization", oaf))
+                    .mapToPair(toPair()))
+            .union(sc.sequenceFile(inputPath + "/project", Text.class, Text.class)
+                    .map(item -> new ObjectMapper().readValue(item._2().toString(), Project.class))
+                    .map(oaf -> new TypedRow("project", oaf))
+                    .mapToPair(toPair()))
+            .union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class)
+                    .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class))
+                    .map(oaf -> new TypedRow("dataset", oaf))
+                    .mapToPair(toPair()))
+            .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)
+                    .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class))
+                    .map(oaf -> new TypedRow("otherresearchproduct", oaf))
+                    .mapToPair(toPair()))
+            .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class)
+                    .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class))
+                    .map(oaf -> new TypedRow("software", oaf))
+                    .mapToPair(toPair()));
+                /*
+            .union(sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
+                    .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
+                    .map(oaf -> new TypedRow("publication", oaf))
+                    .mapToPair(toPair()));
+
+         */
+
+        /*
+        JavaRDD<Relation> rels = sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
+                .map(item -> new ObjectMapper().readValue(item._2().toString(), Relation.class))
+                .map(oaf -> new TypedRow("relation", oaf))
+                .mapToPair(toPair())
+                .groupByKey()
+                .map(t -> Iterables.limit(t._2(), MAX_RELS))
+                .flatMap(t -> t.iterator())
+                .map(t -> (Relation) t.getOaf());
+
+        spark.createDataset(rels.rdd(), Encoders.bean(Relation.class))
+                .write()
+                .mode(SaveMode.Overwrite)
+                .saveAsTable(hiveDbName + ".relation_100");
+        */
+
+        JavaPairRDD<String, TypedRow> bounded_rels = spark.table(hiveDbName + ".relation_" + MAX_RELS)
+                .as(Encoders.bean(Relation.class))
+                .javaRDD()
+                .map(r -> new TypedRow("relation", r))
+                .mapToPair(toPair());
+
+        // build the adjacency list: e -> r
+        JavaPairRDD<String, Tuple2<TypedRow, Optional<TypedRow>>> adjacency_list = entities.leftOuterJoin(bounded_rels);
+
+        JavaRDD<EntityRelEntity> linked_entities = adjacency_list
+                .mapToPair(toPairTarget())          // make rel.targetid explicit so that we can join it
+                .leftOuterJoin(entities)            // again with the entities to get the target entity
+                .map(l -> toEntityRelEntity(l));    // and map it to a more readable representation
+
+        spark.createDataFrame(linked_entities, EntityRelEntity.class)
+                .write()
+                .mode(SaveMode.Overwrite)
+                .saveAsTable(hiveDbName + ".linked_entities");
+     }
+
+    private EntityRelEntity toEntityRelEntity(Tuple2<String, Tuple2<Tuple2<String, Tuple2<TypedRow, Optional<TypedRow>>>, Optional<TypedRow>>> l) {
+        // extract the entity source
+        final EntityRelEntity res = new EntityRelEntity(l._2()._1()._2()._1());
+
+        if(l._2()._1()._2()._2().isPresent() && l._2()._2().isPresent()) {
+
+            // extract the relationship
+            res.setRelation((Relation) l._2()._1()._2()._2().get().getOaf());
+
+            // extract the related entity
+            res.setTarget(l._2()._2().get());
+        }
+
+        return res;
+    }
+
+    private PairFunction<Tuple2<String, Tuple2<TypedRow, Optional<TypedRow>>>, String, Tuple2<String, Tuple2<TypedRow, Optional<TypedRow>>>> toPairTarget() {
+        return e -> {
+            Optional<TypedRow> o = e._2()._2();
+            if (o.isPresent()) {
+                return new Tuple2<>(((Relation) o.get().getOaf()).getTarget(), e);
+            } else {
+                return new Tuple2<>(null, e);
+            }
+        };
+    }
+
+    private PairFunction<TypedRow, String, TypedRow> toPair() {
+        return e -> {
+            if (!"relation".equals(e.getType())) {
+                return new Tuple2<>( ((OafEntity) e.getOaf()).getId(), e);
+            } else {
+                return new Tuple2<>( ((Relation) e.getOaf()).getSource(), e);
+            }
+        };
+    }
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java
@ -1,32 +1,14 @@
 package eu.dnetlib.dhp.graph;

-import com.google.common.collect.Sets;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.common.EntityPayload;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.compress.GzipCodec;
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
-import scala.Tuple2;
-import scala.runtime.AbstractFunction1;
-
-import java.util.List;
-import java.util.Set;
-import java.util.stream.Collectors;
-
-import static org.apache.commons.lang3.StringUtils.substringAfter;
-import static org.apache.commons.lang3.StringUtils.substringBefore;
-import static org.apache.spark.sql.Encoders.bean;

 public class SparkGraphIndexingJob {

    private final static String ENTITY_NODES_PATH = "/tmp/entity_node";
-    private static final long LIMIT = 100;

    public static void main(String[] args) throws Exception {

@ -37,13 +19,10 @@ public class SparkGraphIndexingJob {
                .appName(SparkGraphIndexingJob.class.getSimpleName())
                .master(parser.get("master"))
                .config("hive.metastore.uris", parser.get("hive_metastore_uris"))
-                .config("spark.driver.cores", 1)
-                .config("spark.executor.cores", 1)
-                .config("spark.yarn.executor.memoryOverhead", "4G")
-                .config("spark.yarn.driver.memoryOverhead", "4G")
                .enableHiveSupport()
                .getOrCreate();

+        final String inputPath = parser.get("sourcePath");
        final String hiveDbName = parser.get("hive_db_name");

        final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
@ -51,52 +30,7 @@ public class SparkGraphIndexingJob {
            fs.delete(new Path(ENTITY_NODES_PATH), true);
        }

-        spark
-            .sql(getJoinEntitiesSQL(hiveDbName))
-            .transform(toEntityNode())
-            /*
-            .map((MapFunction<EntityNode, String>) r -> {
-                return null;
-            }, bean(String.class))
-            */
-            .rdd()
-
-            .saveAsTextFile(ENTITY_NODES_PATH, GzipCodec.class);
-    }
-
-    private static AbstractFunction1<Dataset<Row>, Dataset<EntityNode>> toEntityNode() {
-        return new AbstractFunction1<Dataset<Row>, Dataset<EntityNode>>() {
-            @Override
-            public Dataset<EntityNode> apply(Dataset<Row> d) {
-                return d.map((MapFunction<Row, EntityNode>) r -> {
-
-                    final List<String> res = r.getList(r.fieldIndex("related_entity"));
-                    final byte[] payload = r.getAs("payload");
-                    return new EntityNode(r.getAs("id"), r.getAs("type"), new String(payload))
-                            .setRelatedEntities(res
-                                    .stream()
-                                    .map(re -> new Tuple2<>(substringBefore(re, "@@"), substringAfter(re, "@@")))
-                                    .map(re -> new RelatedEntity(r.getAs("reltype"), r.getAs("subreltype"), r.getAs("relclass"), re._1(), re._2()))
-                                    .limit(LIMIT)
-                                    .collect(Collectors.toList()));
-
-                }, bean(EntityNode.class));
-            }
-        };
-    }
-
-    private static String getJoinEntitiesSQL(String hiveDbName) {
-        return String.format(
-                "SELECT " +
-                        "E_s.id AS id, " +
-                        "E_s.type AS type, " +
-                        "E_s.payload AS payload, " +
-                        "r.reltype AS reltype, r.subreltype AS subreltype, r.relclass AS relclass, " +
-                        "collect_list(concat(E_t.type, '@@', E_t.payload)) AS related_entity " +
-                "FROM %s.entities " + "" /*"TABLESAMPLE(0.1 PERCENT) "*/ + "E_s " +
-                        "LEFT JOIN %s.relation r ON (r.source = E_s.id) " +
-                        "JOIN %s.entities E_t ON (E_t.id = r.target) \n" +
-                "GROUP BY E_s.id, E_s.type, E_s.payload, r.reltype, r.subreltype, r.relclass", hiveDbName, hiveDbName, hiveDbName);
+        new GraphJoiner().join(spark, inputPath, hiveDbName, ENTITY_NODES_PATH);
    }

 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java
@ -0,0 +1,44 @@
+package eu.dnetlib.dhp.graph;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+
+import java.io.Serializable;
+
+public class TypedRow implements Serializable {
+    private String type;
+    private Oaf oaf;
+
+    public TypedRow(String type, Oaf oaf) {
+        this.type = type;
+        this.oaf = oaf;
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public TypedRow setType(String type) {
+        this.type = type;
+        return this;
+    }
+
+    public Oaf getOaf() {
+        return oaf;
+    }
+
+    public TypedRow setOaf(Oaf oaf) {
+        this.oaf = oaf;
+        return this;
+    }
+
+    @Override
+    public String toString() {
+        try {
+            return new ObjectMapper().writeValueAsString(this);
+        } catch (JsonProcessingException e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json
@ -1,5 +1,6 @@
 [
  {"paramName":"mt",  "paramLongName":"master",             "paramDescription": "should be local or yarn",                                  "paramRequired": true},
  {"paramName":"h",   "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris",                                  "paramRequired": true},
-  {"paramName":"db",  "paramLongName":"hive_db_name",       "paramDescription": "the target hive database name",                            "paramRequired": true}
+  {"paramName":"db",  "paramLongName":"hive_db_name",       "paramDescription": "the target hive database name",                            "paramRequired": true},
+  {"paramName":"s",   "paramLongName":"sourcePath",         "paramDescription": "the path of the sequencial file to read",                  "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml
@ -33,8 +33,9 @@
            <name>GraphIndexing</name>
            <class>eu.dnetlib.dhp.graph.SparkGraphIndexingJob</class>
            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" --conf spark.yarn.executor.memoryOverhead=${sparkExecutorMemoryOverhead}</spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_db_name</arg><arg>${hive_db_name}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
        </spark>