forked from antonis.lempesis/dnet-hadoop
WIP: logic for promoting action sets added
This commit is contained in:
parent
bea1a94346
commit
958f0693d6
|
@ -5,7 +5,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-workflows</artifactId>
|
<artifactId>dhp-workflows</artifactId>
|
||||||
<version>1.0.5-SNAPSHOT</version>
|
<version>1.1.6-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>dhp-actionmanager</artifactId>
|
<artifactId>dhp-actionmanager</artifactId>
|
||||||
|
|
||||||
|
@ -52,11 +52,34 @@
|
||||||
<version>2.25.0</version>
|
<version>2.25.0</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-schemas</artifactId>
|
<artifactId>dhp-schemas</artifactId>
|
||||||
<version>1.0.5-SNAPSHOT</version>
|
<version>${project.version}</version>
|
||||||
<scope>compile</scope>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib</groupId>
|
||||||
|
<artifactId>dnet-actionmanager-common</artifactId>
|
||||||
|
<version>[6.0.0, 7.0.0)</version>
|
||||||
|
<exclusions>
|
||||||
|
<!-- duplicate with different groupId -->
|
||||||
|
<exclusion>
|
||||||
|
<groupId>apache</groupId>
|
||||||
|
<artifactId>commons-logging</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-common</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib</groupId>
|
||||||
|
<artifactId>dnet-openaire-data-protos</artifactId>
|
||||||
|
<version>[3.0.0, 4.0.0)</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.api.java.function.ReduceFunction;
|
||||||
|
import org.apache.spark.sql.Column;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.function.BiFunction;
|
||||||
|
|
||||||
|
public class PromoteActionSetFromHDFSFunctions {
|
||||||
|
|
||||||
|
public static <T extends OafEntity> Dataset<T> groupEntitiesByIdAndMerge(Dataset<T> entityDS,
|
||||||
|
Class<T> clazz) {
|
||||||
|
return entityDS
|
||||||
|
.groupByKey((MapFunction<T, String>) OafEntity::getId, Encoders.STRING())
|
||||||
|
.reduceGroups((ReduceFunction<T>) (x1, x2) -> {
|
||||||
|
x1.mergeFrom(x2);
|
||||||
|
return x1;
|
||||||
|
})
|
||||||
|
.map((MapFunction<Tuple2<String, T>, T>) pair -> pair._2, Encoders.bean(clazz));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends OafEntity, S> Dataset<T> joinEntitiesWithActionPayloadAndMerge(Dataset<T> entityDS,
|
||||||
|
Dataset<S> actionPayloadDS,
|
||||||
|
BiFunction<Dataset<T>, Dataset<S>, Column> entityToActionPayloadJoinExpr,
|
||||||
|
BiFunction<S, Class<T>, T> actionPayloadToEntityFn,
|
||||||
|
Class<T> clazz) {
|
||||||
|
return entityDS
|
||||||
|
.joinWith(actionPayloadDS, entityToActionPayloadJoinExpr.apply(entityDS, actionPayloadDS), "left_outer")
|
||||||
|
.map((MapFunction<Tuple2<T, S>, T>) pair -> Optional
|
||||||
|
.ofNullable(pair._2())
|
||||||
|
.map(x -> {
|
||||||
|
T entity = actionPayloadToEntityFn.apply(x, clazz);
|
||||||
|
pair._1().mergeFrom(entity);
|
||||||
|
return pair._1();
|
||||||
|
})
|
||||||
|
.orElse(pair._1()), Encoders.bean(clazz));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -1,24 +1,24 @@
|
||||||
package eu.dnetlib.dhp.actionmanager;
|
package eu.dnetlib.dhp.actionmanager;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.JsonNode;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import eu.dnetlib.data.proto.OafProtos;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.ReduceFunction;
|
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.types.*;
|
import org.apache.spark.sql.types.*;
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collections;
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.actionmanager.PromoteActionSetFromHDFSFunctions.*;
|
||||||
import static org.apache.spark.sql.functions.*;
|
import static org.apache.spark.sql.functions.*;
|
||||||
|
|
||||||
public class PromoteActionSetFromHDFSJob {
|
public class PromoteActionSetFromHDFSJob {
|
||||||
|
@ -28,73 +28,209 @@ public class PromoteActionSetFromHDFSJob {
|
||||||
PromoteActionSetFromHDFSJob.class
|
PromoteActionSetFromHDFSJob.class
|
||||||
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/actionmanager_input_parameters.json")));
|
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/actionmanager_input_parameters.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
String inputActionSetPath = parser.get("input");
|
|
||||||
String outputPath = parser.get("output");
|
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
String inputGraphPath = parser.get("inputGraphPath");
|
||||||
|
List<String> inputActionSetPaths = Arrays.asList(parser.get("inputActionSetPaths").split(","));
|
||||||
|
String outputGraphPath = parser.get("outputGraphPath");
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
conf.setMaster(parser.get("master"));
|
conf.setMaster(parser.get("master"));
|
||||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||||
|
|
||||||
try (SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) {
|
try (SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) {
|
||||||
// reading actions as RDD
|
|
||||||
JavaRDD<Row> actionsRDD = JavaSparkContext
|
|
||||||
.fromSparkContext(spark.sparkContext())
|
|
||||||
.sequenceFile(inputActionSetPath, Text.class, Text.class)
|
|
||||||
.map(x -> RowFactory.create(x._2().toString()));
|
|
||||||
|
|
||||||
// converting actions to DataFrame and deserializing content of TargetValue
|
// ----- READ -----
|
||||||
// using unbase64 on TargetValue content to get String representation
|
// dataset
|
||||||
StructType rowSchema = StructType$.MODULE$.apply(
|
Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasetDS = readGraphTable(
|
||||||
Collections.singletonList(
|
spark, String.format("%s/dataset", inputGraphPath), eu.dnetlib.dhp.schema.oaf.Dataset.class)
|
||||||
StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())
|
|
||||||
));
|
|
||||||
Dataset<Row> deserializedTargetValue = spark.createDataFrame(actionsRDD, rowSchema)
|
|
||||||
.withColumn("TargetValue", get_json_object(col("value"), "$.TargetValue"))
|
|
||||||
.select(unbase64(col("TargetValue")).cast(DataTypes.StringType).as("target_value_json"))
|
|
||||||
.cache();
|
.cache();
|
||||||
|
datasetDS.printSchema();
|
||||||
|
datasetDS.show();
|
||||||
|
|
||||||
// printing: only for testing
|
// datasource
|
||||||
deserializedTargetValue.printSchema();
|
Dataset<Datasource> datasourceDS =
|
||||||
deserializedTargetValue.show();
|
readGraphTable(spark, String.format("%s/datasource", inputGraphPath), Datasource.class)
|
||||||
System.out.println(deserializedTargetValue.first().toString());
|
.cache();
|
||||||
|
datasourceDS.printSchema();
|
||||||
|
datasourceDS.show();
|
||||||
|
|
||||||
// grouping and merging: should be generic
|
// organization
|
||||||
Dataset<Software> softwareDS = deserializedTargetValue
|
Dataset<Organization> organizationDS =
|
||||||
.map((MapFunction<Row, Software>) PromoteActionSetFromHDFSJob::rowToOafEntity, Encoders.kryo(Software.class))
|
readGraphTable(spark, String.format("%s/organization", inputGraphPath), Organization.class)
|
||||||
.groupByKey((MapFunction<Software, String>) OafEntity::getId, Encoders.STRING())
|
.cache();
|
||||||
.reduceGroups((ReduceFunction<Software>) (software1, software2) -> {
|
organizationDS.printSchema();
|
||||||
software1.mergeFrom(software2);
|
organizationDS.show();
|
||||||
return software1;
|
|
||||||
})
|
|
||||||
.map((MapFunction<Tuple2<String, Software>, Software>) pair -> pair._2, Encoders.kryo(Software.class));
|
|
||||||
|
|
||||||
|
// otherresearchproduct
|
||||||
|
Dataset<OtherResearchProduct> otherResearchProductDS =
|
||||||
|
readGraphTable(spark, String.format("%s/otherresearchproduct", inputGraphPath), OtherResearchProduct.class)
|
||||||
|
.cache();
|
||||||
|
otherResearchProductDS.printSchema();
|
||||||
|
otherResearchProductDS.show();
|
||||||
|
|
||||||
|
// project
|
||||||
|
Dataset<Project> projectDS =
|
||||||
|
readGraphTable(spark, String.format("%s/project", inputGraphPath), Project.class)
|
||||||
|
.cache();
|
||||||
|
projectDS.printSchema();
|
||||||
|
projectDS.show();
|
||||||
|
|
||||||
|
// publication
|
||||||
|
Dataset<Publication> publicationDS =
|
||||||
|
readGraphTable(spark, String.format("%s/publication", inputGraphPath), Publication.class)
|
||||||
|
.cache();
|
||||||
|
publicationDS.printSchema();
|
||||||
|
publicationDS.show();
|
||||||
|
|
||||||
|
// relation
|
||||||
|
Dataset<Relation> relationDS =
|
||||||
|
readGraphTable(spark, String.format("%s/relation", inputGraphPath), Relation.class)
|
||||||
|
.cache();
|
||||||
|
relationDS.printSchema();
|
||||||
|
relationDS.show();
|
||||||
|
|
||||||
|
// software
|
||||||
|
Dataset<Software> softwareDS =
|
||||||
|
readGraphTable(spark, String.format("%s/software", inputGraphPath), Software.class)
|
||||||
|
.cache();
|
||||||
softwareDS.printSchema();
|
softwareDS.printSchema();
|
||||||
softwareDS.show();
|
softwareDS.show();
|
||||||
|
|
||||||
// save
|
// actions
|
||||||
// softwareDS.toDF()
|
Dataset<String> actionPayloadDS = inputActionSetPaths.stream()
|
||||||
// .write()
|
.map(inputActionSetPath -> readActionSetPayload(spark, inputActionSetPath))
|
||||||
// .partitionBy("id")
|
.reduce(Dataset::union)
|
||||||
// .save(outputPath);
|
.get()
|
||||||
|
.cache();
|
||||||
|
actionPayloadDS.printSchema();
|
||||||
|
actionPayloadDS.show();
|
||||||
|
System.out.println(String.join("\n", actionPayloadDS.takeAsList(20)));
|
||||||
|
|
||||||
// another approach: using only DataFrames i.e. DataSet<Row>, not DataSets<Software>
|
Dataset<String> relationActionPayloadDS = filterActionPayloadForRelations(actionPayloadDS)
|
||||||
|
.cache();
|
||||||
|
relationActionPayloadDS.printSchema();
|
||||||
|
relationActionPayloadDS.show();
|
||||||
|
|
||||||
|
Dataset<String> entityActionPayloadDS = filterActionPayloadForEntity(actionPayloadDS)
|
||||||
|
.cache();
|
||||||
|
entityActionPayloadDS.printSchema();
|
||||||
|
entityActionPayloadDS.show();
|
||||||
|
|
||||||
|
// ----- LOGIC -----
|
||||||
|
Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> processedDatasetDS =
|
||||||
|
processEntityDS(datasetDS, entityActionPayloadDS, eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||||
|
Dataset<Datasource> processedDatasourceDS =
|
||||||
|
processEntityDS(datasourceDS, entityActionPayloadDS, Datasource.class);
|
||||||
|
Dataset<Organization> processedOrganizationDS =
|
||||||
|
processEntityDS(organizationDS, entityActionPayloadDS, Organization.class);
|
||||||
|
Dataset<OtherResearchProduct> processedOtherResearchProductDS =
|
||||||
|
processEntityDS(otherResearchProductDS, entityActionPayloadDS, OtherResearchProduct.class);
|
||||||
|
Dataset<Project> processedProjectDS =
|
||||||
|
processEntityDS(projectDS, entityActionPayloadDS, Project.class);
|
||||||
|
Dataset<Publication> processedPublicationDS =
|
||||||
|
processEntityDS(publicationDS, entityActionPayloadDS, Publication.class);
|
||||||
|
Dataset<Relation> processedRelationDS =
|
||||||
|
processRelationDS(relationDS, relationActionPayloadDS);
|
||||||
|
Dataset<Software> processedSoftwareDS =
|
||||||
|
processEntityDS(softwareDS, entityActionPayloadDS, Software.class);
|
||||||
|
|
||||||
|
// ----- SAVE -----
|
||||||
|
processedDatasetDS.write()
|
||||||
|
.save(String.format("%s/dataset", outputGraphPath));
|
||||||
|
processedDatasourceDS.write()
|
||||||
|
.save(String.format("%s/datasource", outputGraphPath));
|
||||||
|
processedOrganizationDS.write()
|
||||||
|
.save(String.format("%s/organization", outputGraphPath));
|
||||||
|
processedOtherResearchProductDS.write()
|
||||||
|
.save(String.format("%s/otherresearchproduct", outputGraphPath));
|
||||||
|
processedProjectDS.write()
|
||||||
|
.save(String.format("%s/project", outputGraphPath));
|
||||||
|
processedPublicationDS.write()
|
||||||
|
.save(String.format("%s/publication", outputGraphPath));
|
||||||
|
processedRelationDS.write()
|
||||||
|
.save(String.format("%s/relation", outputGraphPath));
|
||||||
|
processedSoftwareDS.write()
|
||||||
|
.save(String.format("%s/software", outputGraphPath));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Software rowToOafEntity(Row row) {
|
private static final StructType KV_SCHEMA = StructType$.MODULE$.apply(
|
||||||
// converts row with JSON into Software object: should be generic
|
Arrays.asList(
|
||||||
// currently extracts only "entity.id" field from JSON
|
StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()),
|
||||||
ObjectMapper objectMapper = new ObjectMapper();
|
StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())
|
||||||
|
));
|
||||||
|
|
||||||
|
private static <T extends Oaf> Dataset<T> readGraphTable(SparkSession spark, String path, Class<T> clazz) {
|
||||||
|
JavaRDD<Row> rows = JavaSparkContext
|
||||||
|
.fromSparkContext(spark.sparkContext())
|
||||||
|
.sequenceFile(path, Text.class, Text.class)
|
||||||
|
.map(x -> RowFactory.create(x._1().toString(), x._2().toString()));
|
||||||
|
|
||||||
|
return spark.createDataFrame(rows, KV_SCHEMA)
|
||||||
|
.map((MapFunction<Row, T>) row -> new ObjectMapper().readValue(row.<String>getAs("value"), clazz),
|
||||||
|
Encoders.bean(clazz));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Dataset<String> readActionSetPayload(SparkSession spark, String inputActionSetPath) {
|
||||||
|
JavaRDD<Row> actionsRDD = JavaSparkContext
|
||||||
|
.fromSparkContext(spark.sparkContext())
|
||||||
|
.sequenceFile(inputActionSetPath, Text.class, Text.class)
|
||||||
|
.map(x -> RowFactory.create(x._1().toString(), x._2().toString()));
|
||||||
|
|
||||||
|
return spark.createDataFrame(actionsRDD, KV_SCHEMA)
|
||||||
|
.select(unbase64(get_json_object(col("value"), "$.TargetValue"))
|
||||||
|
.cast(DataTypes.StringType).as("target_value_json"))
|
||||||
|
.as(Encoders.STRING());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Dataset<String> filterActionPayloadForRelations(Dataset<String> actionPayloadDS) {
|
||||||
|
return actionPayloadDS
|
||||||
|
.where(get_json_object(col("target_value_json"), "$.kind").equalTo("relation"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Dataset<String> filterActionPayloadForEntity(Dataset<String> actionPayloadDS) {
|
||||||
|
return actionPayloadDS
|
||||||
|
.where(get_json_object(col("target_value_json"), "$.kind").equalTo("entity"));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static <T extends OafEntity> Dataset<T> processEntityDS(Dataset<T> entityDS,
|
||||||
|
Dataset<String> actionPayloadDS,
|
||||||
|
Class<T> clazz) {
|
||||||
|
Dataset<T> groupedAndMerged = groupEntitiesByIdAndMerge(entityDS, clazz);
|
||||||
|
Dataset<T> joinedAndMerged = joinEntitiesWithActionPayloadAndMerge(groupedAndMerged,
|
||||||
|
actionPayloadDS,
|
||||||
|
PromoteActionSetFromHDFSJob::entityToActionPayloadJoinExpr,
|
||||||
|
PromoteActionSetFromHDFSJob::actionPayloadToEntity,
|
||||||
|
clazz);
|
||||||
|
return groupEntitiesByIdAndMerge(joinedAndMerged, clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends OafEntity> Column entityToActionPayloadJoinExpr(Dataset<T> left,
|
||||||
|
Dataset<String> right) {
|
||||||
|
return left.col("id").equalTo(
|
||||||
|
get_json_object(right.col("target_value_json"), "$.entity.id"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends OafEntity> T actionPayloadToEntity(String actionPayload,
|
||||||
|
Class<T> clazz) {
|
||||||
try {
|
try {
|
||||||
JsonNode jsonNode = objectMapper.readTree(row.getString(0));
|
OafProtos.Oaf oldEntity = new ObjectMapper().readValue(actionPayload, OafProtos.Oaf.class);
|
||||||
String id = jsonNode.at("/entity/id").asText();
|
return entityOldToNew(oldEntity, clazz);
|
||||||
Software software = new Software();
|
|
||||||
software.setId(id);
|
|
||||||
return software;
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//TODO
|
||||||
|
private static <T extends OafEntity> T entityOldToNew(OafProtos.Oaf old,
|
||||||
|
Class<T> clazz) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
//TODO
|
||||||
|
private static Dataset<Relation> processRelationDS(Dataset<Relation> relationDS,
|
||||||
|
Dataset<String> actionPayloadDS) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,15 +6,21 @@
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "i",
|
"paramName": "ig",
|
||||||
"paramLongName": "input",
|
"paramLongName": "inputGraphPath",
|
||||||
"paramDescription": "the path of the input sequential file to read",
|
"paramDescription": "#TODO: input graph path",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "ia",
|
||||||
"paramLongName": "output",
|
"paramLongName": "inputActionSetPaths",
|
||||||
"paramDescription": "the path of the result DataFrame on HDFS",
|
"paramDescription": "#TODO: comma separated list of paths to input action sets",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "og",
|
||||||
|
"paramLongName": "outputGraphPath",
|
||||||
|
"paramDescription": "#TODO: the path of the result DataFrame on HDFS",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
|
@ -0,0 +1,169 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.JsonNode;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.sql.Column;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.junit.AfterClass;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.BiFunction;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import static org.apache.spark.sql.functions.get_json_object;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
public class PromoteActionSetFromHDFSFunctionsTest {
|
||||||
|
|
||||||
|
private static SparkSession spark;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeClass() {
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setMaster("local");
|
||||||
|
conf.setAppName(PromoteActionSetFromHDFSFunctionsTest.class.getSimpleName());
|
||||||
|
conf.set("spark.driver.host", "localhost");
|
||||||
|
spark = SparkSession.builder().config(conf).getOrCreate();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterClass
|
||||||
|
public static void afterClass() {
|
||||||
|
spark.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldGroupOafEntitiesByIdAndMergeWithinGroup() {
|
||||||
|
// given
|
||||||
|
String id1 = "id1";
|
||||||
|
String id2 = "id2";
|
||||||
|
String id3 = "id3";
|
||||||
|
List<OafEntityImpl> entityData = Arrays.asList(
|
||||||
|
createOafEntityImpl(id1),
|
||||||
|
createOafEntityImpl(id2), createOafEntityImpl(id2),
|
||||||
|
createOafEntityImpl(id3), createOafEntityImpl(id3), createOafEntityImpl(id3)
|
||||||
|
);
|
||||||
|
Dataset<OafEntityImpl> entityDS = spark.createDataset(entityData, Encoders.bean(OafEntityImpl.class));
|
||||||
|
|
||||||
|
// when
|
||||||
|
List<OafEntityImpl> results = PromoteActionSetFromHDFSFunctions
|
||||||
|
.groupEntitiesByIdAndMerge(entityDS, OafEntityImpl.class)
|
||||||
|
.collectAsList();
|
||||||
|
System.out.println(results.stream().map(x -> String.format("%s:%d", x.getId(), x.merged)).collect(Collectors.joining(",")));
|
||||||
|
|
||||||
|
// then
|
||||||
|
assertEquals(3, results.size());
|
||||||
|
results.forEach(result -> {
|
||||||
|
switch (result.getId()) {
|
||||||
|
case "id1":
|
||||||
|
assertEquals(1, result.merged);
|
||||||
|
break;
|
||||||
|
case "id2":
|
||||||
|
assertEquals(2, result.merged);
|
||||||
|
break;
|
||||||
|
case "id3":
|
||||||
|
assertEquals(3, result.merged);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldJoinWithActionPayloadUsingIdAndMerge() {
|
||||||
|
// given
|
||||||
|
String id1 = "id1";
|
||||||
|
String id2 = "id2";
|
||||||
|
String id3 = "id3";
|
||||||
|
String id4 = "id4";
|
||||||
|
List<OafEntityImpl> entityData = Arrays.asList(
|
||||||
|
createOafEntityImpl(id1), createOafEntityImpl(id2), createOafEntityImpl(id3), createOafEntityImpl(id4)
|
||||||
|
);
|
||||||
|
Dataset<OafEntityImpl> entityDS = spark.createDataset(entityData, Encoders.bean(OafEntityImpl.class));
|
||||||
|
|
||||||
|
List<String> actionPayloadData = Arrays.asList(
|
||||||
|
actionPayload(id1),
|
||||||
|
actionPayload(id2), actionPayload(id2),
|
||||||
|
actionPayload(id3), actionPayload(id3), actionPayload(id3)
|
||||||
|
);
|
||||||
|
Dataset<String> actionPayloadDS = spark.createDataset(actionPayloadData, Encoders.STRING());
|
||||||
|
|
||||||
|
BiFunction<Dataset<OafEntityImpl>, Dataset<String>, Column> entityToActionPayloadJoinExpr = (left, right) ->
|
||||||
|
left.col("id").equalTo(get_json_object(right.col("value"), "$.id"));
|
||||||
|
BiFunction<String, Class<OafEntityImpl>, OafEntityImpl> actionPayloadToEntityFn =
|
||||||
|
(BiFunction<String, Class<OafEntityImpl>, OafEntityImpl> & Serializable) (s, clazz) -> {
|
||||||
|
try {
|
||||||
|
JsonNode jsonNode = new ObjectMapper().readTree(s);
|
||||||
|
String id = jsonNode.at("/id").asText();
|
||||||
|
OafEntityImpl x = new OafEntityImpl();
|
||||||
|
x.setId(id);
|
||||||
|
return x;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// when
|
||||||
|
List<OafEntityImpl> results = PromoteActionSetFromHDFSFunctions
|
||||||
|
.joinEntitiesWithActionPayloadAndMerge(entityDS,
|
||||||
|
actionPayloadDS,
|
||||||
|
entityToActionPayloadJoinExpr,
|
||||||
|
actionPayloadToEntityFn,
|
||||||
|
OafEntityImpl.class)
|
||||||
|
.collectAsList();
|
||||||
|
System.out.println(results.stream().map(x -> String.format("%s:%d", x.getId(), x.merged)).collect(Collectors.joining(",")));
|
||||||
|
|
||||||
|
// then
|
||||||
|
assertEquals(7, results.size());
|
||||||
|
results.forEach(result -> {
|
||||||
|
switch (result.getId()) {
|
||||||
|
case "id1":
|
||||||
|
assertEquals(2, result.merged);
|
||||||
|
break;
|
||||||
|
case "id2":
|
||||||
|
assertEquals(2, result.merged);
|
||||||
|
break;
|
||||||
|
case "id3":
|
||||||
|
assertEquals(2, result.merged);
|
||||||
|
break;
|
||||||
|
case "id4":
|
||||||
|
assertEquals(1, result.merged);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class OafEntityImpl extends OafEntity {
|
||||||
|
private int merged = 1;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void mergeFrom(OafEntity e) {
|
||||||
|
merged += ((OafEntityImpl) e).merged;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMerged() {
|
||||||
|
return merged;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMerged(int merged) {
|
||||||
|
this.merged = merged;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static OafEntityImpl createOafEntityImpl(String id) {
|
||||||
|
OafEntityImpl x = new OafEntityImpl();
|
||||||
|
x.setId(id);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String actionPayload(String id) {
|
||||||
|
return String.format("{\"id\":\"%s\"}", id);
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,7 +1,5 @@
|
||||||
package eu.dnetlib.dhp.actionmanager;
|
package eu.dnetlib.dhp.actionmanager;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.junit.After;
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -14,42 +12,64 @@ import java.util.Objects;
|
||||||
public class PromoteActionSetFromHDFSJobTest {
|
public class PromoteActionSetFromHDFSJobTest {
|
||||||
private ClassLoader cl = getClass().getClassLoader();
|
private ClassLoader cl = getClass().getClassLoader();
|
||||||
private Path workingDir;
|
private Path workingDir;
|
||||||
private Path inputActionSetDir;
|
private Path inputDir;
|
||||||
private Path outputDir;
|
private Path outputDir;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void before() throws IOException {
|
public void before() throws IOException {
|
||||||
workingDir = Files.createTempDirectory("promote_action_set");
|
workingDir = Files.createTempDirectory("promote_action_set");
|
||||||
inputActionSetDir = workingDir.resolve("input");
|
inputDir = workingDir.resolve("input");
|
||||||
outputDir = workingDir.resolve("output");
|
outputDir = workingDir.resolve("output");
|
||||||
}
|
}
|
||||||
|
|
||||||
@After
|
// @After
|
||||||
public void after() throws IOException {
|
// public void after() throws IOException {
|
||||||
FileUtils.deleteDirectory(workingDir.toFile());
|
// FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
}
|
// }
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void shouldReadAtomicActionsFromHDFSAndWritePartitionedAsParquetFiles() throws Exception {
|
public void shouldReadAtomicActionsFromHDFSAndWritePartitionedAsParquetFiles() throws Exception {
|
||||||
// given
|
// given
|
||||||
// NOTE: test resource should contain atomic actions in a human readable form, probably as json files; here the
|
// NOTE: test resource should contain atomic actions in a human readable form, probably as json files; here the
|
||||||
// files should be converted to a serialized format and written out to workingDir/input
|
// files should be converted to a serialized format and written out to workingDir/input
|
||||||
// for current testing: actions from software export, given as sequence file are copied to workingDir/input/
|
// for current testing: actions from iis export, given as sequence file are copied to workingDir/input/
|
||||||
Path exportedActionSetDir = Paths.get(Objects.requireNonNull(cl.getResource("entities/entities_software")).getFile());
|
|
||||||
Path inputDir = inputActionSetDir.resolve("entities_software");
|
//graph
|
||||||
Files.createDirectories(inputDir);
|
Path inputGraphDir = inputDir.resolve("graph");
|
||||||
copyFiles(exportedActionSetDir, inputDir);
|
Files.createDirectories(inputGraphDir);
|
||||||
|
copyFiles(Paths.get(Objects.requireNonNull(cl.getResource("graph")).getFile()), inputGraphDir);
|
||||||
|
|
||||||
|
//actions
|
||||||
|
Path inputActionsDir = inputDir.resolve("actions");
|
||||||
|
Files.createDirectories(inputActionsDir);
|
||||||
|
|
||||||
|
Path inputEntitiesPatentDir = inputActionsDir.resolve("entities_patent");
|
||||||
|
Files.createDirectories(inputEntitiesPatentDir);
|
||||||
|
copyFiles(Paths.get(Objects.requireNonNull(cl.getResource("actions/entities_patent")).getFile()), inputEntitiesPatentDir);
|
||||||
|
|
||||||
|
Path inputEntitiesSoftwareDir = inputActionsDir.resolve("entities_software");
|
||||||
|
Files.createDirectories(inputEntitiesSoftwareDir);
|
||||||
|
copyFiles(Paths.get(Objects.requireNonNull(cl.getResource("actions/entities_software")).getFile()), inputEntitiesSoftwareDir);
|
||||||
|
|
||||||
|
String inputActionSetPaths = String.join(",", inputEntitiesSoftwareDir.toString()); //inputEntitiesPatentDir.toString(),
|
||||||
|
|
||||||
PromoteActionSetFromHDFSJob.main(new String[]{
|
PromoteActionSetFromHDFSJob.main(new String[]{
|
||||||
"-mt", "local[*]",
|
"-master", "local[*]",
|
||||||
"-i", inputDir.toString(),
|
"-inputGraphPath", inputGraphDir.toString(),
|
||||||
"-o", outputDir.toString()
|
"-inputActionSetPaths", inputActionSetPaths,
|
||||||
|
"-outputGraphPath", outputDir.toString()
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void copyFiles(Path source, Path target) throws IOException {
|
private static void copyFiles(Path source, Path target) throws IOException {
|
||||||
Files.list(source).forEach(f -> {
|
Files.list(source).forEach(f -> {
|
||||||
try {
|
try {
|
||||||
Files.copy(f, target.resolve(f.getFileName()));
|
if (Files.isDirectory(f)) {
|
||||||
|
Path subTarget = Files.createDirectories(target.resolve(f.getFileName()));
|
||||||
|
copyFiles(f, subTarget);
|
||||||
|
} else {
|
||||||
|
Files.copy(f, target.resolve(f.getFileName()));
|
||||||
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
|
|
Loading…
Reference in New Issue