forked from antonis.lempesis/dnet-hadoop
dataset based provision WIP
This commit is contained in:
parent
c8f4b95464
commit
ca345aaad3
|
@ -3,31 +3,25 @@ package eu.dnetlib.dhp.oa.provision;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.*;
|
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils;
|
import eu.dnetlib.dhp.oa.provision.model.Tuple2;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.*;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.rdd.RDD;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.apache.spark.sql.catalyst.expressions.Encode;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Spliterator;
|
|
||||||
import java.util.Spliterators;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.StreamSupport;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
|
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
|
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
|
||||||
|
@ -43,14 +37,19 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
|
||||||
* can be linked at most to 100 other objects
|
* can be linked at most to 100 other objects
|
||||||
*
|
*
|
||||||
* 2) JoinRelationEntityByTargetJob:
|
* 2) JoinRelationEntityByTargetJob:
|
||||||
* prepare tuples [source entity - relation - target entity] (S - R - T):
|
* (phase 1): prepare tuples [relation - target entity] (R - T):
|
||||||
* for each entity type E_i
|
* for each entity type E_i
|
||||||
* join (R.target = E_i.id),
|
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
|
||||||
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
|
* join (R.target = T_i.id)
|
||||||
* join (E_i.id = [R - T_i].source), where E_i becomes the source entity S
|
* save the tuples (R_i, T_i)
|
||||||
|
* (phase 2):
|
||||||
|
* create the union of all the entity types E, hash by id
|
||||||
|
* read the tuples (R, T), hash by R.source
|
||||||
|
* join E.id = (R, T).source, where E becomes the Source Entity S
|
||||||
|
* save the tuples (S, R, T)
|
||||||
*
|
*
|
||||||
* 3) AdjacencyListBuilderJob:
|
* 3) AdjacencyListBuilderJob:
|
||||||
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity
|
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
|
||||||
*
|
*
|
||||||
* 4) XmlConverterJob:
|
* 4) XmlConverterJob:
|
||||||
* convert the JoinedEntities as XML records
|
* convert the JoinedEntities as XML records
|
||||||
|
@ -59,7 +58,6 @@ public class AdjacencyListBuilderJob {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class);
|
private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class);
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
public static final int MAX_LINKS = 100;
|
public static final int MAX_LINKS = 100;
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
@ -91,7 +89,6 @@ public class AdjacencyListBuilderJob {
|
||||||
removeOutputDir(spark, outputPath);
|
removeOutputDir(spark, outputPath);
|
||||||
createAdjacencyLists(spark, inputPath, outputPath);
|
createAdjacencyLists(spark, inputPath, outputPath);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void createAdjacencyLists(SparkSession spark, String inputPath, String outputPath) {
|
private static void createAdjacencyLists(SparkSession spark, String inputPath, String outputPath) {
|
||||||
|
@ -103,7 +100,7 @@ public class AdjacencyListBuilderJob {
|
||||||
.groupByKey((MapFunction<EntityRelEntity, String>) value -> value.getEntity().getId(), Encoders.STRING())
|
.groupByKey((MapFunction<EntityRelEntity, String>) value -> value.getEntity().getId(), Encoders.STRING())
|
||||||
.mapGroups((MapGroupsFunction<String, EntityRelEntity, JoinedEntity>) (key, values) -> {
|
.mapGroups((MapGroupsFunction<String, EntityRelEntity, JoinedEntity>) (key, values) -> {
|
||||||
JoinedEntity j = new JoinedEntity();
|
JoinedEntity j = new JoinedEntity();
|
||||||
Links links = new Links();
|
List<Tuple2> links = new ArrayList<>();
|
||||||
while (values.hasNext() && links.size() < MAX_LINKS) {
|
while (values.hasNext() && links.size() < MAX_LINKS) {
|
||||||
EntityRelEntity curr = values.next();
|
EntityRelEntity curr = values.next();
|
||||||
if (j.getEntity() == null) {
|
if (j.getEntity() == null) {
|
||||||
|
|
|
@ -4,9 +4,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
|
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
|
||||||
|
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
|
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils;
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
@ -37,22 +37,22 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
|
||||||
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
|
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
|
||||||
* can be linked at most to 100 other objects
|
* can be linked at most to 100 other objects
|
||||||
*
|
*
|
||||||
* 2) CreateRelatedEntitiesJob_phase1:
|
* 2) JoinRelationEntityByTargetJob:
|
||||||
* prepare tuples [relation - target entity] (R - T):
|
* (phase 1): prepare tuples [relation - target entity] (R - T):
|
||||||
* for each entity type E_i
|
* for each entity type E_i
|
||||||
* join (R.target = E_i.id),
|
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
|
||||||
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
|
* join (R.target = T_i.id)
|
||||||
|
* save the tuples (R_i, T_i)
|
||||||
|
* (phase 2):
|
||||||
|
* create the union of all the entity types E, hash by id
|
||||||
|
* read the tuples (R, T), hash by R.source
|
||||||
|
* join E.id = (R, T).source, where E becomes the Source Entity S
|
||||||
|
* save the tuples (S, R, T)
|
||||||
*
|
*
|
||||||
* 3) CreateRelatedEntitiesJob_phase2:
|
* 3) AdjacencyListBuilderJob:
|
||||||
* prepare tuples [source entity - relation - target entity] (S - R - T):
|
|
||||||
* create the union of the each entity type, hash by id (S)
|
|
||||||
* for each [R - T_i] produced in phase1
|
|
||||||
* join S.id = [R - T_i].source to produce (S_i - R - T_i)
|
|
||||||
*
|
|
||||||
* 4) AdjacencyListBuilderJob:
|
|
||||||
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
|
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
|
||||||
*
|
*
|
||||||
* 5) XmlConverterJob:
|
* 4) XmlConverterJob:
|
||||||
* convert the JoinedEntities as XML records
|
* convert the JoinedEntities as XML records
|
||||||
*/
|
*/
|
||||||
public class CreateRelatedEntitiesJob_phase1 {
|
public class CreateRelatedEntitiesJob_phase1 {
|
||||||
|
@ -103,20 +103,21 @@ public class CreateRelatedEntitiesJob_phase1 {
|
||||||
private static <E extends OafEntity> void joinRelationEntity(SparkSession spark, String inputRelationsPath, String inputEntityPath, Class<E> entityClazz, String outputPath) {
|
private static <E extends OafEntity> void joinRelationEntity(SparkSession spark, String inputRelationsPath, String inputEntityPath, Class<E> entityClazz, String outputPath) {
|
||||||
|
|
||||||
Dataset<Tuple2<String, SortableRelation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
|
Dataset<Tuple2<String, SortableRelation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
|
||||||
|
.filter((FilterFunction<SortableRelation>) value -> value.getDataInfo().getDeletedbyinference() == false)
|
||||||
.map((MapFunction<SortableRelation, Tuple2<String, SortableRelation>>) r -> new Tuple2<>(r.getTarget(), r),
|
.map((MapFunction<SortableRelation, Tuple2<String, SortableRelation>>) r -> new Tuple2<>(r.getTarget(), r),
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class)));
|
Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class)))
|
||||||
|
.cache();
|
||||||
|
|
||||||
Dataset<Tuple2<String, E>> entities = readPathEntity(spark, inputEntityPath, entityClazz)
|
Dataset<Tuple2<String, RelatedEntity>> entities = readPathEntity(spark, inputEntityPath, entityClazz)
|
||||||
.map((MapFunction<E, Tuple2<String, E>>) e -> new Tuple2<>(e.getId(), e),
|
.map((MapFunction<E, RelatedEntity>) value -> asRelatedEntity(value, entityClazz), Encoders.bean(RelatedEntity.class))
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(entityClazz)))
|
.map((MapFunction<RelatedEntity, Tuple2<String, RelatedEntity>>) e -> new Tuple2<>(e.getId(), e),
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class)))
|
||||||
.cache();
|
.cache();
|
||||||
|
|
||||||
relsByTarget
|
relsByTarget
|
||||||
.joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner")
|
.joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner")
|
||||||
.filter((FilterFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, E>>>)
|
.map((MapFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, RelatedEntity>>, EntityRelEntity>)
|
||||||
value -> value._2()._2().getDataInfo().getDeletedbyinference() == false)
|
t -> new EntityRelEntity(t._1()._2(), t._2()._2()),
|
||||||
.map((MapFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, E>>, EntityRelEntity>)
|
|
||||||
t -> new EntityRelEntity(t._1()._2(), GraphMappingUtils.asRelatedEntity(t._2()._2(), entityClazz)),
|
|
||||||
Encoders.bean(EntityRelEntity.class))
|
Encoders.bean(EntityRelEntity.class))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
|
|
|
@ -6,11 +6,9 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
|
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.TypedRow;
|
import eu.dnetlib.dhp.oa.provision.model.TypedRow;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
@ -25,12 +23,10 @@ import scala.collection.JavaConverters;
|
||||||
import scala.collection.Seq;
|
import scala.collection.Seq;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.function.Function;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
|
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
|
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
|
||||||
|
@ -45,24 +41,22 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
|
||||||
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
|
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
|
||||||
* can be linked at most to 100 other objects
|
* can be linked at most to 100 other objects
|
||||||
*
|
*
|
||||||
* 2) CreateRelatedEntitiesJob_phase1:
|
* 2) JoinRelationEntityByTargetJob:
|
||||||
* prepare tuples [relation - target entity] (R - T):
|
* (phase 1): prepare tuples [relation - target entity] (R - T):
|
||||||
* for each entity type E_i
|
* for each entity type E_i
|
||||||
* join (R.target = E_i.id),
|
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
|
||||||
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
|
* join (R.target = T_i.id)
|
||||||
* save the tuples [R - T_i] in append mode
|
* save the tuples (R_i, T_i)
|
||||||
|
* (phase 2):
|
||||||
|
* create the union of all the entity types E, hash by id
|
||||||
|
* read the tuples (R, T), hash by R.source
|
||||||
|
* join E.id = (R, T).source, where E becomes the Source Entity S
|
||||||
|
* save the tuples (S, R, T)
|
||||||
*
|
*
|
||||||
* 3) CreateRelatedEntitiesJob_phase2:
|
* 3) AdjacencyListBuilderJob:
|
||||||
* prepare tuples [source entity - relation - target entity] (S - R - T):
|
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
|
||||||
* create the union of the each entity type, hash by id (S)
|
|
||||||
* for each [R - T_i] produced in phase1
|
|
||||||
* join S.id = [R - T_i].source to produce (S_i - R - T_i)
|
|
||||||
* save in append mode
|
|
||||||
*
|
*
|
||||||
* 4) AdjacencyListBuilderJob:
|
* 4) XmlConverterJob:
|
||||||
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity
|
|
||||||
*
|
|
||||||
* 5) XmlConverterJob:
|
|
||||||
* convert the JoinedEntities as XML records
|
* convert the JoinedEntities as XML records
|
||||||
*/
|
*/
|
||||||
public class CreateRelatedEntitiesJob_phase2 {
|
public class CreateRelatedEntitiesJob_phase2 {
|
||||||
|
|
|
@ -1,31 +1,22 @@
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.google.common.collect.Iterators;
|
import com.google.common.collect.Iterators;
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
|
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
|
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.rdd.RDD;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import scala.Tuple2;
|
|
||||||
import scala.math.Ordering;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
@ -44,14 +35,19 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
* can be linked at most to 100 other objects
|
* can be linked at most to 100 other objects
|
||||||
*
|
*
|
||||||
* 2) JoinRelationEntityByTargetJob:
|
* 2) JoinRelationEntityByTargetJob:
|
||||||
* prepare tuples [source entity - relation - target entity] (S - R - T):
|
* (phase 1): prepare tuples [relation - target entity] (R - T):
|
||||||
* for each entity type E_i
|
* for each entity type E_i
|
||||||
* join (R.target = E_i.id),
|
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
|
||||||
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
|
* join (R.target = T_i.id)
|
||||||
* join (E_i.id = [R - T_i].source), where E_i becomes the source entity S
|
* save the tuples (R_i, T_i)
|
||||||
|
* (phase 2):
|
||||||
|
* create the union of all the entity types E, hash by id
|
||||||
|
* read the tuples (R, T), hash by R.source
|
||||||
|
* join E.id = (R, T).source, where E becomes the Source Entity S
|
||||||
|
* save the tuples (S, R, T)
|
||||||
*
|
*
|
||||||
* 3) AdjacencyListBuilderJob:
|
* 3) AdjacencyListBuilderJob:
|
||||||
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity
|
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
|
||||||
*
|
*
|
||||||
* 4) XmlConverterJob:
|
* 4) XmlConverterJob:
|
||||||
* convert the JoinedEntities as XML records
|
* convert the JoinedEntities as XML records
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
@ -27,8 +28,11 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
@ -37,23 +41,25 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
|
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
|
||||||
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
|
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
|
||||||
*
|
*
|
||||||
* The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again
|
|
||||||
* by E, finally grouped by E.id;
|
|
||||||
*
|
|
||||||
* The workflow is organized in different parts aimed to to reduce the complexity of the operation
|
* The workflow is organized in different parts aimed to to reduce the complexity of the operation
|
||||||
* 1) PrepareRelationsJob:
|
* 1) PrepareRelationsJob:
|
||||||
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
|
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
|
||||||
* can be linked at most to 100 other objects
|
* can be linked at most to 100 other objects
|
||||||
*
|
*
|
||||||
* 2) JoinRelationEntityByTargetJob:
|
* 2) JoinRelationEntityByTargetJob:
|
||||||
* prepare tuples [source entity - relation - target entity] (S - R - T):
|
* (phase 1): prepare tuples [relation - target entity] (R - T):
|
||||||
* for each entity type E_i
|
* for each entity type E_i
|
||||||
* join (R.target = E_i.id),
|
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
|
||||||
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
|
* join (R.target = T_i.id)
|
||||||
* join (E_i.id = [R - T_i].source), where E_i becomes the source entity S
|
* save the tuples (R_i, T_i)
|
||||||
|
* (phase 2):
|
||||||
|
* create the union of all the entity types E, hash by id
|
||||||
|
* read the tuples (R, T), hash by R.source
|
||||||
|
* join E.id = (R, T).source, where E becomes the Source Entity S
|
||||||
|
* save the tuples (S, R, T)
|
||||||
*
|
*
|
||||||
* 3) AdjacencyListBuilderJob:
|
* 3) AdjacencyListBuilderJob:
|
||||||
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity
|
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
|
||||||
*
|
*
|
||||||
* 4) XmlConverterJob:
|
* 4) XmlConverterJob:
|
||||||
* convert the JoinedEntities as XML records
|
* convert the JoinedEntities as XML records
|
||||||
|
@ -62,6 +68,8 @@ public class XmlConverterJob {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
|
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
|
||||||
|
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
@ -107,12 +115,31 @@ public class XmlConverterJob {
|
||||||
spark.read()
|
spark.read()
|
||||||
.load(inputPath)
|
.load(inputPath)
|
||||||
.as(Encoders.bean(JoinedEntity.class))
|
.as(Encoders.bean(JoinedEntity.class))
|
||||||
|
/* .map((MapFunction<JoinedEntity, String>) value -> OBJECT_MAPPER.writeValueAsString(value), Encoders.STRING())
|
||||||
|
.write()
|
||||||
|
.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
|
||||||
|
.text("/tmp/json");
|
||||||
|
|
||||||
|
spark.read()
|
||||||
|
.textFile("/tmp/json")
|
||||||
|
.map((MapFunction<String, JoinedEntity>) value -> OBJECT_MAPPER.readValue(value, JoinedEntity.class), Encoders.bean(JoinedEntity.class))
|
||||||
|
.map((MapFunction<JoinedEntity, JoinedEntity>) j -> {
|
||||||
|
if (j.getLinks() != null) {
|
||||||
|
j.setLinks(j.getLinks()
|
||||||
|
.stream()
|
||||||
|
.filter(t -> t.getRelation() != null & t.getRelatedEntity() != null)
|
||||||
|
.collect(Collectors.toCollection(ArrayList::new)));
|
||||||
|
}
|
||||||
|
return j;
|
||||||
|
}, Encoders.bean(JoinedEntity.class))
|
||||||
|
|
||||||
|
*/
|
||||||
.map((MapFunction<JoinedEntity, Tuple2<String, String>>) je -> new Tuple2<>(
|
.map((MapFunction<JoinedEntity, Tuple2<String, String>>) je -> new Tuple2<>(
|
||||||
je.getEntity().getId(),
|
je.getEntity().getId(),
|
||||||
recordFactory.build(je)
|
recordFactory.build(je)
|
||||||
), Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
), Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||||
.javaRDD()
|
.javaRDD()
|
||||||
.mapToPair((PairFunction<Tuple2<String, String>, String, String>) t -> t)
|
.mapToPair((PairFunction<Tuple2<String, String>, Text, Text>) t -> new Tuple2<>(new Text(t._1()), new Text(t._2())))
|
||||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
package eu.dnetlib.dhp.oa.provision.model;
|
package eu.dnetlib.dhp.oa.provision.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
public class JoinedEntity implements Serializable {
|
public class JoinedEntity implements Serializable {
|
||||||
|
|
||||||
private TypedRow entity;
|
private TypedRow entity;
|
||||||
|
|
||||||
private Links links;
|
private List<Tuple2> links;
|
||||||
|
|
||||||
public JoinedEntity() {
|
public JoinedEntity() {
|
||||||
}
|
}
|
||||||
|
@ -19,11 +20,11 @@ public class JoinedEntity implements Serializable {
|
||||||
this.entity = entity;
|
this.entity = entity;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Links getLinks() {
|
public List<Tuple2> getLinks() {
|
||||||
return links;
|
return links;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setLinks(Links links) {
|
public void setLinks(List<Tuple2> links) {
|
||||||
this.links = links;
|
this.links = links;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +0,0 @@
|
||||||
package eu.dnetlib.dhp.oa.provision.model;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
public class Links extends HashSet<Tuple2> implements Serializable {
|
|
||||||
|
|
||||||
public Links() {
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -114,7 +114,7 @@ public class GraphMappingUtils {
|
||||||
|
|
||||||
final RelatedEntity re = new RelatedEntity();
|
final RelatedEntity re = new RelatedEntity();
|
||||||
re.setId(entity.getId());
|
re.setId(entity.getId());
|
||||||
re.setType(clazz.getName());
|
re.setType(EntityType.fromClass(clazz).name());
|
||||||
|
|
||||||
re.setPid(entity.getPid());
|
re.setPid(entity.getPid());
|
||||||
re.setCollectedfrom(entity.getCollectedfrom());
|
re.setCollectedfrom(entity.getCollectedfrom());
|
||||||
|
@ -125,16 +125,16 @@ public class GraphMappingUtils {
|
||||||
case otherresearchproduct:
|
case otherresearchproduct:
|
||||||
case software:
|
case software:
|
||||||
|
|
||||||
Result r = (Result) entity;
|
Result result = (Result) entity;
|
||||||
|
|
||||||
if (r.getTitle() == null && !r.getTitle().isEmpty()) {
|
if (result.getTitle() == null && !result.getTitle().isEmpty()) {
|
||||||
re.setTitle(r.getTitle().stream().findFirst().get());
|
re.setTitle(result.getTitle().stream().findFirst().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
re.setDateofacceptance(getValue(r.getDateofacceptance()));
|
re.setDateofacceptance(getValue(result.getDateofacceptance()));
|
||||||
re.setPublisher(getValue(r.getPublisher()));
|
re.setPublisher(getValue(result.getPublisher()));
|
||||||
re.setResulttype(re.getResulttype());
|
re.setResulttype(result.getResulttype());
|
||||||
re.setInstances(re.getInstances());
|
re.setInstances(result.getInstance());
|
||||||
|
|
||||||
//TODO still to be mapped
|
//TODO still to be mapped
|
||||||
//re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
|
//re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
|
||||||
|
|
|
@ -694,7 +694,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
if (isNotBlank(re.getCodeRepositoryUrl())) {
|
if (isNotBlank(re.getCodeRepositoryUrl())) {
|
||||||
metadata.add(XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
|
metadata.add(XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
|
||||||
}
|
}
|
||||||
if (re.getResulttype() != null & !re.getResulttype().isBlank()) {
|
if (re.getResulttype() != null & re.getResulttype().isBlank()) {
|
||||||
metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype()));
|
metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype()));
|
||||||
}
|
}
|
||||||
if (re.getCollectedfrom() != null) {
|
if (re.getCollectedfrom() != null) {
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
<switch>
|
<switch>
|
||||||
<case to="prepare_relations">${wf:conf('reuseRecords') eq false}</case>
|
<case to="prepare_relations">${wf:conf('reuseRecords') eq false}</case>
|
||||||
<case to="to_solr_index">${wf:conf('reuseRecords') eq true}</case>
|
<case to="to_solr_index">${wf:conf('reuseRecords') eq true}</case>
|
||||||
<default to="adjancency_lists"/>
|
<default to="prepare_relations"/>
|
||||||
</switch>
|
</switch>
|
||||||
</decision>
|
</decision>
|
||||||
|
|
||||||
|
@ -132,7 +132,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
--conf spark.network.timeout=${sparkNetworkTimeout}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
|
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
|
||||||
|
@ -324,7 +324,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
--conf spark.network.timeout=${sparkNetworkTimeout}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputGraphRootPath</arg><arg>${inputGraphRootPath}</arg>
|
<arg>--inputGraphRootPath</arg><arg>${inputGraphRootPath}</arg>
|
||||||
|
@ -351,7 +351,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
--conf spark.network.timeout=${sparkNetworkTimeout}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg> <arg>${workingDir}/join_entities</arg>
|
<arg>--inputPath</arg> <arg>${workingDir}/join_entities</arg>
|
||||||
|
@ -365,7 +365,7 @@
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>build_adjacency_lists</name>
|
<name>convert_to_xml</name>
|
||||||
<class>eu.dnetlib.dhp.oa.provision.XmlConverterJob</class>
|
<class>eu.dnetlib.dhp.oa.provision.XmlConverterJob</class>
|
||||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
|
|
Loading…
Reference in New Issue