forked from antonis.lempesis/dnet-hadoop

joining entities using T x R x S method with groupByKey

This commit is contained in:
parent 1cd6899480
commit 799929c1e3
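The "T x R x S method with groupByKey" in the commit message means: join the source entities (T) with their outbound relations (R), re-key by target id to attach the related entities (S), then groupByKey on the source id to collect each entity together with its related entities. A minimal, self-contained sketch of the idea against the Spark 2.x Java API follows; all names (TxRxSSketch, Rel, the string payloads) are illustrative and not part of this commit, and the fan-out cap mirrors the MAX_RELS constant changed below.

import com.google.common.collect.Iterables;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

import java.io.Serializable;

public class TxRxSSketch implements Serializable {

    public static class Rel implements Serializable {
        public String source, target;
        public Rel() {}
        public Rel(String s, String t) { source = s; target = t; }
    }

    // entities: (entityId -> json), relBySource: (sourceId -> Rel)
    public static JavaRDD<String> join(JavaPairRDD<String, String> entities,
                                       JavaPairRDD<String, Rel> relBySource,
                                       int maxRels) {
        // cap the per-source relation fan-out before the expensive joins
        JavaPairRDD<String, Rel> capped = relBySource
                .groupByKey()
                .flatMapValues(rs -> Iterables.limit(rs, maxRels));

        return capped
                .join(entities)                                            // T x R: attach the source entity
                .mapToPair(t -> new Tuple2<>(t._2()._1().target, t._2()))  // re-key by target id
                .join(entities)                                            // (T x R) x S: attach the target entity
                .mapToPair(t -> new Tuple2<>(t._2()._1()._1().source,      // back to the source id
                        t._2()._1()._2() + " -> " + t._2()._2()))
                .groupByKey()                                              // one group per source entity
                .map(g -> g._1() + ": " + g._2());
    }
}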
(Maven pom.xml, parent section)

@@ -5,7 +5,7 @@
 	<parent>
 		<groupId>eu.dnetlib.dhp</groupId>
 		<artifactId>dhp</artifactId>
-		<version>1.1.5-SNAPSHOT</version>
+		<version>1.0.5-SNAPSHOT</version>
 		<relativePath>../</relativePath>
 	</parent>

eu/dnetlib/dhp/schema/dli/Entity.java (deleted)

@@ -1,118 +0,0 @@
-package eu.dnetlib.dhp.schema.dli;
-
-import java.io.Serializable;
-import java.util.List;
-
-public class Entity implements Serializable {
-
-    private String identifier;
-
-    private List<Pid> pid;
-
-    private List<String> title;
-
-    private List<String> date;
-
-    private String typology;
-
-    private List<String> authors;
-
-    private List<Subject> subject;
-
-    private String description;
-
-    private String completionStatus;
-
-    private List<Provenance> collectedFrom;
-
-    private List<String> publisher;
-
-    public String getIdentifier() {
-        return identifier;
-    }
-
-    public void setIdentifier(String identifier) {
-        this.identifier = identifier;
-    }
-
-    public List<Pid> getPid() {
-        return pid;
-    }
-
-    public void setPid(List<Pid> pid) {
-        this.pid = pid;
-    }
-
-    public List<String> getTitle() {
-        return title;
-    }
-
-    public void setTitle(List<String> title) {
-        this.title = title;
-    }
-
-    public List<String> getDate() {
-        return date;
-    }
-
-    public void setDate(List<String> date) {
-        this.date = date;
-    }
-
-    public String getTypology() {
-        return typology;
-    }
-
-    public void setTypology(String typology) {
-        this.typology = typology;
-    }
-
-    public List<String> getAuthors() {
-        return authors;
-    }
-
-    public void setAuthors(List<String> authors) {
-        this.authors = authors;
-    }
-
-    public List<Subject> getSubject() {
-        return subject;
-    }
-
-    public void setSubject(List<Subject> subject) {
-        this.subject = subject;
-    }
-
-    public String getDescription() {
-        return description;
-    }
-
-    public void setDescription(String description) {
-        this.description = description;
-    }
-
-    public List<Provenance> getCollectedFrom() {
-        return collectedFrom;
-    }
-
-    public void setCollectedFrom(List<Provenance> collectedFrom) {
-        this.collectedFrom = collectedFrom;
-    }
-
-    public List<String> getPublisher() {
-        return publisher;
-    }
-
-    public void setPublisher(List<String> publisher) {
-        this.publisher = publisher;
-    }
-
-    public String getCompletionStatus() {
-        return completionStatus;
-    }
-
-    public void setCompletionStatus(String completionStatus) {
-        this.completionStatus = completionStatus;
-    }
-}

eu/dnetlib/dhp/schema/dli/Pid.java (deleted)

@@ -1,33 +0,0 @@
-package eu.dnetlib.dhp.schema.dli;
-
-import eu.dnetlib.dhp.utils.DHPUtils;
-import org.apache.commons.lang3.StringUtils;
-
-public class Pid {
-
-    private String pid;
-
-    private String pidType;
-
-    public String getPid() {
-        return pid;
-    }
-
-    public void setPid(String pid) {
-        this.pid = pid;
-    }
-
-    public String getPidType() {
-        return pidType;
-    }
-
-    public void setPidType(String pidType) {
-        this.pidType = pidType;
-    }
-
-    public String generateId() {
-        if (StringUtils.isEmpty(pid) || StringUtils.isEmpty(pidType))
-            return null;
-        return DHPUtils.md5(String.format("%s::%s", pid, pidType));
-    }
-}

eu/dnetlib/dhp/schema/dli/Provenance.java (deleted)

@@ -1,35 +0,0 @@
-package eu.dnetlib.dhp.schema.dli;
-
-public class Provenance {
-
-    private String datasourceId;
-
-    private String datasourceName;
-
-    private String completionStatus;
-
-    public String getDatasourceId() {
-        return datasourceId;
-    }
-
-    public void setDatasourceId(String datasourceId) {
-        this.datasourceId = datasourceId;
-    }
-
-    public String getDatasourceName() {
-        return datasourceName;
-    }
-
-    public void setDatasourceName(String datasourceName) {
-        this.datasourceName = datasourceName;
-    }
-
-    public String getCompletionStatus() {
-        return completionStatus;
-    }
-
-    public void setCompletionStatus(String completionStatus) {
-        this.completionStatus = completionStatus;
-    }
-}

eu/dnetlib/dhp/schema/dli/Relation.java (deleted)

@@ -1,47 +0,0 @@
-package eu.dnetlib.dhp.schema.dli;
-
-import java.io.Serializable;
-import java.util.List;
-
-public class Relation implements Serializable {
-
-    private String source;
-
-    private String target;
-
-    private List<Provenance> provenance;
-
-    private RelationSemantic semantic;
-
-    public String getSource() {
-        return source;
-    }
-
-    public void setSource(String source) {
-        this.source = source;
-    }
-
-    public String getTarget() {
-        return target;
-    }
-
-    public void setTarget(String target) {
-        this.target = target;
-    }
-
-    public List<Provenance> getProvenance() {
-        return provenance;
-    }
-
-    public void setProvenance(List<Provenance> provenance) {
-        this.provenance = provenance;
-    }
-
-    public RelationSemantic getSemantic() {
-        return semantic;
-    }
-
-    public void setSemantic(RelationSemantic semantic) {
-        this.semantic = semantic;
-    }
-}

eu/dnetlib/dhp/schema/dli/RelationSemantic.java (deleted)

@@ -1,16 +0,0 @@
-package eu.dnetlib.dhp.schema.dli;
-
-import java.io.Serializable;
-
-public class RelationSemantic extends Subject implements Serializable {
-
-    public String inverse;
-
-    public String getInverse() {
-        return inverse;
-    }
-
-    public void setInverse(String inverse) {
-        this.inverse = inverse;
-    }
-}

eu/dnetlib/dhp/schema/dli/Subject.java (deleted)

@@ -1,35 +0,0 @@
-package eu.dnetlib.dhp.schema.dli;
-
-import java.io.Serializable;
-
-public class Subject implements Serializable {
-
-    private String schema;
-
-    private String value;
-
-    public Subject() {
-    }
-
-    public Subject(String schema, String value) {
-        this.schema = schema;
-        this.value = value;
-    }
-
-    public String getSchema() {
-        return schema;
-    }
-
-    public void setSchema(String schema) {
-        this.schema = schema;
-    }
-
-    public String getValue() {
-        return value;
-    }
-
-    public void setValue(String value) {
-        this.value = value;
-    }
-}

(job properties)

@@ -1,5 +1,4 @@
 sparkDriverMemory=7G
 sparkExecutorMemory=7G
-sparkExecutorMemoryOverhead=5G
 hive_db_name=claudio
 sourcePath=/tmp/db_openaireplus_services_beta.export.2019.11.06

(Maven pom.xml, dependencies section)

@@ -19,6 +19,10 @@
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-sql_2.11</artifactId>
 		</dependency>
+		<dependency>
+			<groupId>com.jayway.jsonpath</groupId>
+			<artifactId>json-path</artifactId>
+		</dependency>

 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>

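The json-path dependency added here is what GraphJoiner (below) uses to pull single fields such as $.id out of the serialized entities without deserializing them into full Oaf beans. A minimal usage sketch; the sample JSON and class name are made up:

import com.jayway.jsonpath.JsonPath;

public class JsonPathSketch {
    public static void main(String[] args) {
        final String json = "{\"id\":\"50|abc\",\"source\":\"50|abc\",\"target\":\"40|xyz\"}";
        // JsonPath.read evaluates a path expression against the raw JSON string
        final String id = JsonPath.read(json, "$.id");         // -> 50|abc
        final String target = JsonPath.read(json, "$.target"); // -> 40|xyz
        System.out.println(id + " / " + target);
    }
}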
eu/dnetlib/dhp/graph/EntityNode.java (deleted)

@@ -1,4 +0,0 @@
-package eu.dnetlib.dhp.graph;
-
-public class EntityNode {
-}

eu/dnetlib/dhp/graph/EntityRelEntity.java

@@ -1,20 +1,30 @@
 package eu.dnetlib.dhp.graph;

-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-
 import java.io.Serializable;

 public class EntityRelEntity implements Serializable {
     private TypedRow source;
-    private Relation relation;
+    private TypedRow relation;
     private TypedRow target;

+    public EntityRelEntity() {
+    }
+
     public EntityRelEntity(TypedRow source) {
         this.source = source;
     }

+    //helpers
+    public Boolean hasMainEntity() {
+        return getSource() != null & getRelation() == null & getTarget() == null;
+    }
+
+    public Boolean hasRelatedEntity() {
+        return getSource() == null & getRelation() != null & getTarget() != null;
+    }
+
     public TypedRow getSource() {
         return source;
     }

@@ -24,11 +34,11 @@ public class EntityRelEntity implements Serializable {
         return this;
     }

-    public Relation getRelation() {
+    public TypedRow getRelation() {
         return relation;
     }

-    public EntityRelEntity setRelation(Relation relation) {
+    public EntityRelEntity setRelation(TypedRow relation) {
         this.relation = relation;
         return this;
     }

@@ -42,12 +52,4 @@ public class EntityRelEntity implements Serializable {
         return this;
     }
-
-    @Override
-    public String toString() {
-        try {
-            return new ObjectMapper().writeValueAsString(this);
-        } catch (JsonProcessingException e) {
-            throw new RuntimeException(e);
-        }
-    }
 }

eu/dnetlib/dhp/graph/GraphJoiner.java

@@ -1,139 +1,119 @@
 package eu.dnetlib.dhp.graph;

 import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.oaf.*;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.jayway.jsonpath.JsonPath;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import scala.Tuple2;

 import java.io.Serializable;
+import java.util.List;

 public class GraphJoiner implements Serializable {

-    public static final int MAX_RELS = 100;
+    public static final int MAX_RELS = 10;

     public void join(final SparkSession spark, final String inputPath, final String hiveDbName, final String outPath) {

         final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

-        /*
-        JavaPairRDD<String, TypedRow> entities = sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
-                .map(oaf -> new TypedRow("publication", oaf))
-                .mapToPair(toPair());
-        */
-        JavaPairRDD<String, TypedRow> entities = sc.sequenceFile(inputPath + "/datasource", Text.class, Text.class)
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Datasource.class))
-                .map(oaf -> new TypedRow("datasource", oaf))
-                .mapToPair(toPair())
-                .union(sc.sequenceFile(inputPath + "/organization", Text.class, Text.class)
-                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Organization.class))
-                        .map(oaf -> new TypedRow("organization", oaf))
-                        .mapToPair(toPair()))
-                .union(sc.sequenceFile(inputPath + "/project", Text.class, Text.class)
-                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Project.class))
-                        .map(oaf -> new TypedRow("project", oaf))
-                        .mapToPair(toPair()))
-                .union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class)
-                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class))
-                        .map(oaf -> new TypedRow("dataset", oaf))
-                        .mapToPair(toPair()))
-                .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)
-                        .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class))
-                        .map(oaf -> new TypedRow("otherresearchproduct", oaf))
-                        .mapToPair(toPair()))
-                .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class)
-                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class))
-                        .map(oaf -> new TypedRow("software", oaf))
-                        .mapToPair(toPair()));
-        /*
-                .union(sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
-                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
-                        .map(oaf -> new TypedRow("publication", oaf))
-                        .mapToPair(toPair()));
-        */
+        final String entityIdPath = "$.id";
+
+        JavaPairRDD<String, TypedRow> datasource = readPathEntity(sc, entityIdPath, inputPath, "datasource");
+        JavaPairRDD<String, TypedRow> organization = readPathEntity(sc, entityIdPath, inputPath, "organization");
+        JavaPairRDD<String, TypedRow> project = readPathEntity(sc, entityIdPath, inputPath, "project");
+        JavaPairRDD<String, TypedRow> dataset = readPathEntity(sc, entityIdPath, inputPath, "dataset");
+        JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(sc, entityIdPath, inputPath, "otherresearchproduct");
+        JavaPairRDD<String, TypedRow> software = readPathEntity(sc, entityIdPath, inputPath, "software");
+        JavaPairRDD<String, TypedRow> publication = readPathEntity(sc, entityIdPath, inputPath, "publication");
+
+        final String entitiesPath = outPath + "/entities";
+        datasource
+                .union(organization)
+                .union(project)
+                .union(dataset)
+                .union(otherresearchproduct)
+                .union(software)
+                .union(publication)
+                .map(e -> new EntityRelEntity().setSource(e._2()))
+                .map(e -> new ObjectMapper().writeValueAsString(e))
+                .saveAsTextFile(entitiesPath, GzipCodec.class);
+
+        JavaPairRDD<String, EntityRelEntity> entities = sc.textFile(entitiesPath)
+                .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
+                .mapToPair(t -> new Tuple2<>(t.getSource().getSource(), t));

-        /*
-        JavaRDD<Relation> rels = sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Relation.class))
-                .map(oaf -> new TypedRow("relation", oaf))
-                .mapToPair(toPair())
+        final JavaPairRDD<String, EntityRelEntity> relation = readPathRelation(sc, inputPath)
+                .map(p -> new EntityRelEntity().setRelation(p))
+                .mapToPair(p -> new Tuple2<>(p.getRelation().getSource(), p))
                 .groupByKey()
-                .map(t -> Iterables.limit(t._2(), MAX_RELS))
-                .flatMap(t -> t.iterator())
-                .map(t -> (Relation) t.getOaf());
+                .map(p -> Iterables.limit(p._2(), MAX_RELS))
+                .flatMap(p -> p.iterator())
+                .mapToPair(p -> new Tuple2<>(p.getRelation().getTarget(), p));

-        spark.createDataset(rels.rdd(), Encoders.bean(Relation.class))
-                .write()
-                .mode(SaveMode.Overwrite)
-                .saveAsTable(hiveDbName + ".relation_100");
-        */
+        final String joinByTargetPath = outPath + "/join_by_target";
+        relation.join(entities)
+                .map(s -> new EntityRelEntity()
+                        .setRelation(s._2()._1().getRelation())
+                        .setTarget(s._2()._2().getSource()))
+                .map(e -> new ObjectMapper().writeValueAsString(e))
+                .saveAsTextFile(joinByTargetPath, GzipCodec.class);

-        JavaPairRDD<String, TypedRow> bounded_rels = spark.table(hiveDbName + ".relation_" + MAX_RELS)
-                .as(Encoders.bean(Relation.class))
-                .javaRDD()
-                .map(r -> new TypedRow("relation", r))
-                .mapToPair(toPair());
+        JavaPairRDD<String, EntityRelEntity> bySource = sc.textFile(joinByTargetPath)
+                .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
+                .mapToPair(t -> new Tuple2<>(t.getRelation().getSource(), t));

-        // build the adjacency list: e -> r
-        JavaPairRDD<String, Tuple2<TypedRow, Optional<TypedRow>>> adjacency_list = entities.leftOuterJoin(bounded_rels);
-
-        JavaRDD<EntityRelEntity> linked_entities = adjacency_list
-                .mapToPair(toPairTarget())      // make rel.targetid explicit so that we can join it
-                .leftOuterJoin(entities)        // again with the entities to get the target entity
-                .map(l -> toEntityRelEntity(l)); // and map it to a more readable representation
-
-        spark.createDataFrame(linked_entities, EntityRelEntity.class)
-                .write()
-                .mode(SaveMode.Overwrite)
-                .saveAsTable(hiveDbName + ".linked_entities");
+        entities
+                .union(bySource)
+                .groupByKey()   // by source id
+                .map(p -> {
+                    final LinkedEntity e = new LinkedEntity();
+                    final List<Tuple> links = Lists.newArrayList();
+                    for(EntityRelEntity rel : p._2()) {
+                        if (rel.hasMainEntity() & e.getEntity() == null) {
+                            e.setEntity(rel.getSource());
+                        }
+                        if (rel.hasRelatedEntity()) {
+                            links.add(new Tuple()
+                                    .setRelation(rel.getRelation())
+                                    .setTarget(rel.getTarget()));
+                        }
+                    }
+                    e.setLinks(links);
+                    if (e.getEntity() == null) {
+                        throw new IllegalStateException("missing main entity on '" + p._1() + "'");
+                    }
+                    return e;
+                })
+                .map(e -> new ObjectMapper().writeValueAsString(e))
+                .saveAsTextFile(outPath + "/linked_entities", GzipCodec.class);
     }

-    private EntityRelEntity toEntityRelEntity(Tuple2<String, Tuple2<Tuple2<String, Tuple2<TypedRow, Optional<TypedRow>>>, Optional<TypedRow>>> l) {
-        // extract the entity source
-        final EntityRelEntity res = new EntityRelEntity(l._2()._1()._2()._1());
-
-        if(l._2()._1()._2()._2().isPresent() && l._2()._2().isPresent()) {
-
-            // extract the relationship
-            res.setRelation((Relation) l._2()._1()._2()._2().get().getOaf());
-
-            // extract the related entity
-            res.setTarget(l._2()._2().get());
-        }
-
-        return res;
-    }
-
-    private PairFunction<Tuple2<String, Tuple2<TypedRow, Optional<TypedRow>>>, String, Tuple2<String, Tuple2<TypedRow, Optional<TypedRow>>>> toPairTarget() {
-        return e -> {
-            Optional<TypedRow> o = e._2()._2();
-            if (o.isPresent()) {
-                return new Tuple2<>(((Relation) o.get().getOaf()).getTarget(), e);
-            } else {
-                return new Tuple2<>(null, e);
-            }
-        };
-    }
-
-    private PairFunction<TypedRow, String, TypedRow> toPair() {
-        return e -> {
-            if (!"relation".equals(e.getType())) {
-                return new Tuple2<>( ((OafEntity) e.getOaf()).getId(), e);
-            } else {
-                return new Tuple2<>( ((Relation) e.getOaf()).getSource(), e);
-            }
-        };
-    }
+    private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String idPath, final String inputPath, final String type) {
+        return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class)
+                .mapToPair((PairFunction<Tuple2<Text, Text>, String, TypedRow>) item -> {
+                    final String json = item._2().toString();
+                    final String id = JsonPath.read(json, idPath);
+                    return new Tuple2<>(id, new TypedRow(id, type, json));
+                });
+    }
+
+    private JavaRDD<TypedRow> readPathRelation(final JavaSparkContext sc, final String inputPath) {
+        return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
+                .map(item -> {
+                    final String json = item._2().toString();
+                    final String source = JsonPath.read(json, "$.source");
+                    final String target = JsonPath.read(json, "$.target");
+                    return new TypedRow(source, target, "relation", json);
+                });
+    }
 }

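Worth noting in the new join(): each phase writes its intermediate result as gzipped JSON text and re-reads it, rather than carrying one long RDD lineage through all three joins; the no-arg EntityRelEntity constructor added above is what makes the Jackson round-trip possible. A condensed sketch of that pattern, with the path and class name illustrative:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class RoundTripSketch {
    // serialize beans to gzipped JSON text, then read them back: this cuts the
    // lineage and leaves each phase with a durable, inspectable output on HDFS
    public static JavaRDD<EntityRelEntity> roundTrip(JavaSparkContext sc,
                                                     JavaRDD<EntityRelEntity> rows,
                                                     String path) {
        rows.map(r -> new ObjectMapper().writeValueAsString(r))
                .saveAsTextFile(path, GzipCodec.class);
        return sc.textFile(path)
                .map(s -> new ObjectMapper().readValue(s, EntityRelEntity.class));
    }
}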
eu/dnetlib/dhp/graph/LinkedEntity.java (new)

@@ -0,0 +1,29 @@
+package eu.dnetlib.dhp.graph;
+
+import java.io.Serializable;
+import java.util.List;
+
+public class LinkedEntity implements Serializable {
+
+    private TypedRow entity;
+
+    private List<Tuple> links;
+
+    public TypedRow getEntity() {
+        return entity;
+    }
+
+    public LinkedEntity setEntity(TypedRow entity) {
+        this.entity = entity;
+        return this;
+    }
+
+    public List<Tuple> getLinks() {
+        return links;
+    }
+
+    public LinkedEntity setLinks(List<Tuple> links) {
+        this.links = links;
+        return this;
+    }
+}

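Together with Tuple and TypedRow (further down), a LinkedEntity is the per-source output record of the join: the main entity plus its relation/target pairs. An illustrative construction using this commit's builder setters; every value below is made up:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;

public class LinkedEntitySketch {
    public static void main(String[] args) throws Exception {
        final LinkedEntity e = new LinkedEntity()
                .setEntity(new TypedRow("50|abc", "publication", "{...}"))
                .setLinks(Lists.newArrayList(
                        new Tuple()
                                .setRelation(new TypedRow("50|abc", "40|xyz", "relation", "{...}"))
                                .setTarget(new TypedRow("40|xyz", "project", "{...}"))));
        // prints the JSON shape the job stores under /linked_entities
        System.out.println(new ObjectMapper().writeValueAsString(e));
    }
}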
eu/dnetlib/dhp/graph/RelatedEntity.java (deleted)

@@ -1,69 +0,0 @@
-package eu.dnetlib.dhp.graph;
-
-import java.io.Serializable;
-
-public class RelatedEntity implements Serializable {
-
-    private String relType;
-
-    private String subRelType;
-
-    private String relClass;
-
-    private String type;
-
-    private String payload;
-
-    public RelatedEntity(String relType, String subRelType, String relClass, String type, String payload) {
-        this.relType = relType;
-        this.subRelType = subRelType;
-        this.relClass = relClass;
-        this.type = type;
-        this.payload = payload;
-    }
-
-    public String getRelType() {
-        return relType;
-    }
-
-    public RelatedEntity setRelType(String relType) {
-        this.relType = relType;
-        return this;
-    }
-
-    public String getSubRelType() {
-        return subRelType;
-    }
-
-    public RelatedEntity setSubRelType(String subRelType) {
-        this.subRelType = subRelType;
-        return this;
-    }
-
-    public String getRelClass() {
-        return relClass;
-    }
-
-    public RelatedEntity setRelClass(String relClass) {
-        this.relClass = relClass;
-        return this;
-    }
-
-    public String getType() {
-        return type;
-    }
-
-    public RelatedEntity setType(String type) {
-        this.type = type;
-        return this;
-    }
-
-    public String getPayload() {
-        return payload;
-    }
-
-    public RelatedEntity setPayload(String payload) {
-        this.payload = payload;
-        return this;
-    }
-}

eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java

@@ -4,21 +4,27 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.spark.SparkConf;
 import org.apache.spark.sql.SparkSession;

 public class SparkGraphIndexingJob {

-    private final static String ENTITY_NODES_PATH = "/tmp/entity_node";
+    private final static String OUTPUT_BASE_PATH = "/tmp/openaire_provision";

     public static void main(String[] args) throws Exception {

         final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
         parser.parseArgument(args);

+        final SparkConf conf = new SparkConf()
+                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+                .set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
         final SparkSession spark = SparkSession
                 .builder()
+                .config(conf)
                 .appName(SparkGraphIndexingJob.class.getSimpleName())
                 .master(parser.get("master"))
-                .config("hive.metastore.uris", parser.get("hive_metastore_uris"))
                 .enableHiveSupport()
                 .getOrCreate();

@@ -26,11 +32,12 @@ public class SparkGraphIndexingJob {
         final String hiveDbName = parser.get("hive_db_name");

         final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
-        if (fs.exists(new Path(ENTITY_NODES_PATH))) {
-            fs.delete(new Path(ENTITY_NODES_PATH), true);
+        if (fs.exists(new Path(OUTPUT_BASE_PATH))) {
+            fs.delete(new Path(OUTPUT_BASE_PATH), true);
+            fs.mkdirs(new Path(OUTPUT_BASE_PATH));
         }

-        new GraphJoiner().join(spark, inputPath, hiveDbName, ENTITY_NODES_PATH);
+        new GraphJoiner().join(spark, inputPath, hiveDbName, OUTPUT_BASE_PATH);
     }
 }

eu/dnetlib/dhp/graph/Tuple.java (new)

@@ -0,0 +1,31 @@
+package eu.dnetlib.dhp.graph;
+
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+import java.io.Serializable;
+
+public class Tuple implements Serializable {
+
+    private TypedRow relation;
+
+    private TypedRow target;
+
+    public TypedRow getRelation() {
+        return relation;
+    }
+
+    public Tuple setRelation(TypedRow relation) {
+        this.relation = relation;
+        return this;
+    }
+
+    public TypedRow getTarget() {
+        return target;
+    }
+
+    public Tuple setTarget(TypedRow target) {
+        this.target = target;
+        return this;
+    }
+}

eu/dnetlib/dhp/graph/TypedRow.java

@@ -1,20 +1,46 @@
 package eu.dnetlib.dhp.graph;

-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-
 import java.io.Serializable;

 public class TypedRow implements Serializable {
-    private String type;
-    private Oaf oaf;

-    public TypedRow(String type, Oaf oaf) {
+    private String source;
+    private String target;
+    private String type;
+    private String oaf;
+
+    public TypedRow() {
+    }
+
+    public TypedRow(String source, String type, String oaf) {
+        this.source = source;
         this.type = type;
         this.oaf = oaf;
     }

+    public TypedRow(String source, String target, String type, String oaf) {
+        this(source, type, oaf);
+        this.target = target;
+    }
+
+    public String getSource() {
+        return source;
+    }
+
+    public TypedRow setSource(String source) {
+        this.source = source;
+        return this;
+    }
+
+    public String getTarget() {
+        return target;
+    }
+
+    public TypedRow setTarget(String target) {
+        this.target = target;
+        return this;
+    }
+
     public String getType() {
         return type;
     }

@@ -24,21 +50,13 @@ public class TypedRow implements Serializable {
         return this;
     }

-    public Oaf getOaf() {
+    public String getOaf() {
         return oaf;
     }

-    public TypedRow setOaf(Oaf oaf) {
+    public TypedRow setOaf(String oaf) {
         this.oaf = oaf;
         return this;
     }
-
-    @Override
-    public String toString() {
-        try {
-            return new ObjectMapper().writeValueAsString(this);
-        } catch (JsonProcessingException e) {
-            throw new RuntimeException(e);
-        }
-    }
 }

(Oozie workflow configuration)

@@ -23,4 +23,12 @@
         <name>hive_db_name</name>
         <value>openaire</value>
     </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/applicationHistory</value>
+    </property>
 </configuration>

(Oozie workflow definition)

@@ -16,6 +16,14 @@
         <name>sparkExecutorCores</name>
         <description>number of cores used by single executor</description>
     </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <description>spark 2.* yarn history server address</description>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <description>spark 2.* event log dir location</description>
+    </property>
 </parameters>

 <start to="GraphJoinEntities"/>

@@ -33,7 +41,16 @@
     <name>GraphIndexing</name>
     <class>eu.dnetlib.dhp.graph.SparkGraphIndexingJob</class>
     <jar>dhp-graph-provision-${projectVersion}.jar</jar>
-    <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" --conf spark.yarn.executor.memoryOverhead=${sparkExecutorMemoryOverhead}</spark-opts>
+    <spark-opts>
+        --executor-memory ${sparkExecutorMemory}
+        --executor-cores ${sparkExecutorCores}
+        --driver-memory=${sparkDriverMemory}
+        --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+        --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
+        --conf spark.sql.warehouse.dir="/user/hive/warehouse"
+        --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+        --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>-mt</arg> <arg>yarn-cluster</arg>
     <arg>--sourcePath</arg><arg>${sourcePath}</arg>
     <arg>--hive_db_name</arg><arg>${hive_db_name}</arg>

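The two new --conf lines are plain Spark settings wired through the workflow parameters above. Expressed directly on a SparkConf they would look roughly like the sketch below; the values come from the configuration above, while spark.eventLog.enabled is an assumption added here, since the history server only has something to replay when event logging is switched on:

import org.apache.spark.SparkConf;

public class HistoryServerConfSketch {
    public static SparkConf conf() {
        return new SparkConf()
                // where the YARN RM links "History" for finished applications
                .set("spark.yarn.historyServer.address", "http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088")
                // assumption: event logging must be enabled for the dir to be used
                .set("spark.eventLog.enabled", "true")
                .set("spark.eventLog.dir", "hdfs:///user/spark/applicationHistory");
    }
}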