diff --git a/dhp-workflows/dhp-actionmanager/pom.xml b/dhp-workflows/dhp-actionmanager/pom.xml new file mode 100644 index 000000000..be76db755 --- /dev/null +++ b/dhp-workflows/dhp-actionmanager/pom.xml @@ -0,0 +1,62 @@ + + + 4.0.0 + + eu.dnetlib.dhp + dhp-workflows + 1.0.5-SNAPSHOT + + dhp-actionmanager + + + + org.apache.spark + spark-core_2.11 + + + + org.apache.spark + spark-sql_2.11 + + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + + net.sf.saxon + Saxon-HE + + + + dom4j + dom4j + + + + xml-apis + xml-apis + + + + jaxen + jaxen + + + + org.mockito + mockito-core + 2.25.0 + test + + + eu.dnetlib.dhp + dhp-schemas + 1.0.5-SNAPSHOT + compile + + + diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/PromoteActionSetFromHDFSJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/PromoteActionSetFromHDFSJob.java new file mode 100644 index 000000000..e21dd2ace --- /dev/null +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/PromoteActionSetFromHDFSJob.java @@ -0,0 +1,100 @@ +package eu.dnetlib.dhp.actionmanager; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Software; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.ReduceFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.*; +import scala.Tuple2; + +import java.io.IOException; +import java.util.Collections; + +import static org.apache.spark.sql.functions.*; + +public class PromoteActionSetFromHDFSJob { + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString( + PromoteActionSetFromHDFSJob.class + .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/actionmanager_input_parameters.json"))); + parser.parseArgument(args); + String inputActionSetPath = parser.get("input"); + String outputPath = parser.get("output"); + + final SparkConf conf = new SparkConf(); + conf.setMaster(parser.get("master")); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + + try (SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) { + // reading actions as RDD + JavaRDD actionsRDD = JavaSparkContext + .fromSparkContext(spark.sparkContext()) + .sequenceFile(inputActionSetPath, Text.class, Text.class) + .map(x -> RowFactory.create(x._2().toString())); + + // converting actions to DataFrame and deserializing content of TargetValue + // using unbase64 on TargetValue content to get String representation + StructType rowSchema = StructType$.MODULE$.apply( + Collections.singletonList( + StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()) + )); + Dataset deserializedTargetValue = spark.createDataFrame(actionsRDD, rowSchema) + .withColumn("TargetValue", get_json_object(col("value"), "$.TargetValue")) + .select(unbase64(col("TargetValue")).cast(DataTypes.StringType).as("target_value_json")) + .cache(); + + // printing: only for testing + deserializedTargetValue.printSchema(); + deserializedTargetValue.show(); + System.out.println(deserializedTargetValue.first().toString()); + + // grouping and merging: should be generic + Dataset softwareDS = deserializedTargetValue + .map((MapFunction) PromoteActionSetFromHDFSJob::rowToOafEntity, Encoders.kryo(Software.class)) + .groupByKey((MapFunction) OafEntity::getId, Encoders.STRING()) + .reduceGroups((ReduceFunction) (software1, software2) -> { + software1.mergeFrom(software2); + return software1; + }) + .map((MapFunction, Software>) pair -> pair._2, Encoders.kryo(Software.class)); + + softwareDS.printSchema(); + softwareDS.show(); + + // save +// softwareDS.toDF() +// .write() +// .partitionBy("id") +// .save(outputPath); + + // another approach: using only DataFrames i.e. DataSet, not DataSets + } + } + + private static Software rowToOafEntity(Row row) { + // converts row with JSON into Software object: should be generic + // currently extracts only "entity.id" field from JSON + ObjectMapper objectMapper = new ObjectMapper(); + try { + JsonNode jsonNode = objectMapper.readTree(row.getString(0)); + String id = jsonNode.at("/entity/id").asText(); + Software software = new Software(); + software.setId(id); + return software; + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + +} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/actionmanager_input_parameters.json b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/actionmanager_input_parameters.json new file mode 100644 index 000000000..3b95c90d3 --- /dev/null +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/actionmanager_input_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "input", + "paramDescription": "the path of the input sequential file to read", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "output", + "paramDescription": "the path of the result DataFrame on HDFS", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/PromoteActionSetFromHDFSJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/PromoteActionSetFromHDFSJobTest.java new file mode 100644 index 000000000..3020d7b31 --- /dev/null +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/PromoteActionSetFromHDFSJobTest.java @@ -0,0 +1,59 @@ +package eu.dnetlib.dhp.actionmanager; + +import org.apache.commons.io.FileUtils; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Objects; + +public class PromoteActionSetFromHDFSJobTest { + private ClassLoader cl = getClass().getClassLoader(); + private Path workingDir; + private Path inputActionSetDir; + private Path outputDir; + + @Before + public void before() throws IOException { + workingDir = Files.createTempDirectory("promote_action_set"); + inputActionSetDir = workingDir.resolve("input"); + outputDir = workingDir.resolve("output"); + } + + @After + public void after() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + } + + @Test + public void shouldReadAtomicActionsFromHDFSAndWritePartitionedAsParquetFiles() throws Exception { + // given + // NOTE: test resource should contain atomic actions in a human readable form, probably as json files; here the + // files should be converted to a serialized format and written out to workingDir/input + // for current testing: actions from software export, given as sequence file are copied to workingDir/input/ + Path exportedActionSetDir = Paths.get(Objects.requireNonNull(cl.getResource("entities/entities_software")).getFile()); + Path inputDir = inputActionSetDir.resolve("entities_software"); + Files.createDirectories(inputDir); + copyFiles(exportedActionSetDir, inputDir); + PromoteActionSetFromHDFSJob.main(new String[]{ + "-mt", "local[*]", + "-i", inputDir.toString(), + "-o", outputDir.toString() + }); + } + + private static void copyFiles(Path source, Path target) throws IOException { + Files.list(source).forEach(f -> { + try { + Files.copy(f, target.resolve(f.getFileName())); + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + }); + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/_SUCCESS b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00000 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00000 new file mode 100644 index 000000000..a5bfdac14 Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00000 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00001 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00001 new file mode 100644 index 000000000..1af1d3a5e Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00001 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00002 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00002 new file mode 100644 index 000000000..92e046069 Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00002 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00003 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00003 new file mode 100644 index 000000000..82b051a5d Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00003 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00004 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00004 new file mode 100644 index 000000000..c027243a7 Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00004 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00005 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00005 new file mode 100644 index 000000000..23e7ae70a Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00005 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00006 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00006 new file mode 100644 index 000000000..6c5a6c657 Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00006 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00007 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00007 new file mode 100644 index 000000000..159a5a43f Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00007 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00008 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00008 new file mode 100644 index 000000000..9cf258c18 Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00008 differ diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00009 b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00009 new file mode 100644 index 000000000..5429aab27 Binary files /dev/null and b/dhp-workflows/dhp-actionmanager/src/test/resources/entities/entities_software/part-r-00009 differ diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index cf71190a4..b5324fcc6 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -18,6 +18,7 @@ dhp-distcp dhp-graph-mapper dhp-dedup + dhp-actionmanager