diff --git a/.gitignore b/.gitignore
index 394a54797a..f77bee021c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@
 /*/target
 /target
 /*/build
-/build
\ No newline at end of file
+/build
+spark-warehouse
\ No newline at end of file
diff --git a/dhp-spark-jobs/src/main/java/eu/dnetlib/collection/GenerateNativeStoreSparkJob.java b/dhp-spark-jobs/src/main/java/eu/dnetlib/collection/GenerateNativeStoreSparkJob.java
deleted file mode 100644
index d21c0a18e6..0000000000
--- a/dhp-spark-jobs/src/main/java/eu/dnetlib/collection/GenerateNativeStoreSparkJob.java
+++ /dev/null
@@ -1,79 +0,0 @@
-package eu.dnetlib.collection;
-
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.*;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-import scala.Tuple2;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import static org.apache.spark.sql.functions.array_contains;
-
-public class GenerateNativeStoreSparkJob {
-
-
-    public static void main(String[] args) {
-
-        final SparkSession spark = SparkSession
-                .builder()
-                .appName("GenerateNativeStoreSparkJob")
-                .master("local[*]")
-                .getOrCreate();
-
-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-
-        JavaPairRDD<IntWritable, Text> f = sc.sequenceFile("/home/sandro/Downloads/mdstore_oai", IntWritable.class, Text.class);
-
-        String first = f.map(a -> a._2().toString()).first();
-
-
-        final List<StructField> fields = new ArrayList<>();
-
-        fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
-        fields.add(DataTypes.createStructField("format", DataTypes.StringType, false));
-        fields.add(DataTypes.createStructField("formatName", DataTypes.StringType, true));
-        fields.add(DataTypes.createStructField("body", DataTypes.StringType, true));
-
-        JavaRDD<Row> mdRdd = f.map((Function<Tuple2<IntWritable, Text>, Row>) item -> RowFactory.create("" + item._1().get(), "xml", null, item._2().toString()));
-
-        final StructType schema = DataTypes.createStructType(fields);
-        Dataset<Row> ds = spark.createDataFrame(mdRdd, schema);
-
-//        ds.write().save("/home/sandro/Downloads/test.parquet");
-
-        Publication p2 = new Publication();
-        p2.setDates(Collections.singletonList("2018-09-09"));
-        p2.setTitles(Collections.singletonList("Titolo 2"));
-        p2.setIdentifiers(Collections.singletonList(new PID("pmID", "1234567")));
-
-        Publication p1 = new Publication();
-        p1.setDates(Collections.singletonList("2018-09-09"));
-        p1.setTitles(Collections.singletonList("Titolo 1"));
-        p1.setIdentifiers(Collections.singletonList(new PID("doi", "1234567")));
-
-
-
-        Encoder<Publication> encoder = Encoders.bean(Publication.class);
-
-        Dataset<Publication> dp = spark.createDataset(Arrays.asList(p1,p2), encoder);
-
-
-        long count = dp.where(array_contains(new Column("identifiers.schema"), "doi")).count();
-
-        System.out.println("count = " + count);
-
-        System.out.println(ds.count());
-
-
-    }
-}
diff --git a/dhp-wf/pom.xml b/dhp-wf/pom.xml
deleted file mode 100644
index 9444f0d4a7..0000000000
--- a/dhp-wf/pom.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
-                             http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-
-    <parent>
-        <groupId>eu.dnetlib.dhp</groupId>
-        <artifactId>dhp</artifactId>
-        <version>1.0.0-SNAPSHOT</version>
-    </parent>
-
-    <artifactId>dhp-wf</artifactId>
-    <packaging>pom</packaging>
-
-    <modules>
-    </modules>
-
-
-
-</project>
diff --git a/pom.xml b/pom.xml
index 731bfe984e..c0e1048255 100644
--- a/pom.xml
+++ b/pom.xml
@@ -21,10 +21,8 @@
     </parent>
 
     <modules>
-
-
-
-
+        <module>dhp-common</module>
+        <module>dhp-workflows</module>
     </modules>
 
     <issueManagement>
@@ -104,6 +102,7 @@
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-hdfs</artifactId>
             <version>${dhp.hadoop.version}</version>
+            <scope>provided</scope>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
@@ -118,6 +117,20 @@
             <scope>provided</scope>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>${dhp.commons.lang.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+            <version>1.9</version>
+        </dependency>
+
+
+
     </dependencies>
 
     <dependencyManagement>
@@ -239,6 +252,7 @@
         <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
         <dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
         <dhp.spark.version>2.2.0</dhp.spark.version>
+        <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
         <scala.version>2.11.8</scala.version>
     </properties>
 