align dependencies with IIS cluster

This commit is contained in:
Claudio Atzori 2019-10-29 18:10:20 +01:00
parent 5e32a4066a
commit c8bb81cd9a
11 changed files with 49 additions and 52 deletions

View File

@ -24,12 +24,13 @@ public class ArgumentApplicationParser implements Serializable {
} }
private void createOptionMap(final OptionsParameter[] configuration) { private void createOptionMap(final OptionsParameter[] configuration) {
Arrays.stream(configuration).map(conf -> Option.builder(conf.getParamName())
.longOpt(conf.getParamLongName()) Arrays.stream(configuration).map(conf -> {
.required(conf.isParamRequired()) final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
.desc(conf.getParamDescription()) o.setLongOpt(conf.getParamLongName());
.hasArg() // This option has an argument. o.setRequired(conf.isParamRequired());
.build()).forEach(options::addOption); return o;
}).forEach(options::addOption);
// HelpFormatter formatter = new HelpFormatter(); // HelpFormatter formatter = new HelpFormatter();
// formatter.printHelp("myapp", null, options, null, true); // formatter.printHelp("myapp", null, options, null, true);
@ -38,7 +39,7 @@ public class ArgumentApplicationParser implements Serializable {
} }
public void parseArgument(final String[] args) throws Exception { public void parseArgument(final String[] args) throws Exception {
CommandLineParser parser = new DefaultParser(); CommandLineParser parser = new BasicParser();
CommandLine cmd = parser.parse(options, args); CommandLine cmd = parser.parse(options, args);
Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue())); Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue()));
} }

View File

@ -17,18 +17,7 @@
<dependencies> <dependencies>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>commons-io</groupId>
<artifactId>commons-io</artifactId> <artifactId>commons-io</artifactId>

View File

@ -5,18 +5,19 @@ import eu.dnetlib.data.proto.OafProtos;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.Test; import org.junit.Test;
public class TestParseProtoJson { public class TestParseProtoJson {
@Test @Test
public void testParse() throws Exception { public void testParse() throws Exception {
final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/schema/proto/hugeRecord.json")); final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/schema/proto/hugeRecord.json"));
OafProtos.Oaf.Builder oafBuilder =OafProtos.Oaf.newBuilder(); final OafProtos.Oaf.Builder oafBuilder = OafProtos.Oaf.newBuilder();
JsonFormat.merge(json,oafBuilder);
System.out.println(JsonFormat.printToString(oafBuilder.build())); JsonFormat jf = new JsonFormat();
jf.merge(IOUtils.toInputStream(json), oafBuilder);
OafProtos.Oaf oaf = oafBuilder.build();
System.out.println(jf.printToString(oaf));
} }
} }

View File

@ -74,7 +74,7 @@ public class GenerateNativeStoreSparkJob {
final Map<String, String> ongoingMap = new HashMap<>(); final Map<String, String> ongoingMap = new HashMap<>();
final Map<String, String> reportMap = new HashMap<>(); final Map<String, String> reportMap = new HashMap<>();
final boolean test = parser.get("isTest") == null?false: Boolean.valueOf(parser.get("isTest")); final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -86,7 +86,7 @@ public class GenerateNativeStoreSparkJob {
final MessageManager manager = new MessageManager(parser.get("rabbitHost"), parser.get("rabbitUser"), parser.get("rabbitPassword"), false, false, null); final MessageManager manager = new MessageManager(parser.get("rabbitHost"), parser.get("rabbitUser"), parser.get("rabbitPassword"), false, false, null);
final JavaRDD<MetadataRecord> mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), parser.get("xpath"), parser.get("encoding"),provenance, dateOfCollection, totalItems, invalidRecords)) final JavaRDD<MetadataRecord> mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), parser.get("xpath"), parser.get("encoding"), provenance, dateOfCollection, totalItems, invalidRecords))
.filter(Objects::nonNull).distinct(); .filter(Objects::nonNull).distinct();
ongoingMap.put("ongoing", "0"); ongoingMap.put("ongoing", "0");

View File

@ -17,18 +17,15 @@ public class CollectionJobTest {
@Before @Before
public void setup() throws IOException { public void setup() throws IOException {
testDir = Files.createTempDirectory("dhp-collection"); testDir = Files.createTempDirectory("dhp-collection");
} }
@After @After
public void teadDown() throws IOException { public void teadDown() throws IOException {
FileUtils.deleteDirectory(testDir.toFile()); FileUtils.deleteDirectory(testDir.toFile());
} }
@Test @Test
public void tesCollection () throws Exception { public void tesCollection() throws Exception {
Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
GenerateNativeStoreSparkJob.main(new String[] { GenerateNativeStoreSparkJob.main(new String[] {
"-mt", "local", "-mt", "local",

View File

@ -16,8 +16,6 @@
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_2.11</artifactId>

View File

@ -5,14 +5,19 @@ import eu.dnetlib.data.proto.FieldTypeProtos;
import eu.dnetlib.data.proto.OafProtos; import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.data.proto.ResultProtos; import eu.dnetlib.data.proto.ResultProtos;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import java.io.IOException;
import java.util.stream.Collectors; import java.util.stream.Collectors;
public class ProtoUtils { public class ProtoUtils {
public static OafProtos.Oaf parse(String json) throws JsonFormat.ParseException { public static OafProtos.Oaf parse(String json) throws IOException {
final OafProtos.Oaf.Builder builder = OafProtos.Oaf.newBuilder(); final OafProtos.Oaf.Builder builder = OafProtos.Oaf.newBuilder();
JsonFormat.merge(json, builder);
final JsonFormat jf = new JsonFormat();
jf.merge(IOUtils.toInputStream(json), builder);
return builder.build(); return builder.build();
} }

View File

@ -1,6 +1,5 @@
package eu.dnetlib.dhp.graph; package eu.dnetlib.dhp.graph;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -15,10 +14,8 @@ import scala.Tuple2;
public class SparkGraphImporterJob { public class SparkGraphImporterJob {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
parser.parseArgument(args); parser.parseArgument(args);
final SparkSession spark = SparkSession final SparkSession spark = SparkSession
@ -34,7 +31,6 @@ public class SparkGraphImporterJob {
final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath, Text.class, Text.class) final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath, Text.class, Text.class)
.map(item -> new Tuple2<>(item._1.toString(), item._2.toString())); .map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));
final JavaRDD<Oaf> oafRdd = inputRDD.filter(s -> !StringUtils.isBlank(s._2()) && !s._1().contains("@update")).map(Tuple2::_2).map(ProtoConverter::convert); final JavaRDD<Oaf> oafRdd = inputRDD.filter(s -> !StringUtils.isBlank(s._2()) && !s._1().contains("@update")).map(Tuple2::_2).map(ProtoConverter::convert);
final Encoder<Organization> organizationEncoder = Encoders.bean(Organization.class); final Encoder<Organization> organizationEncoder = Encoders.bean(Organization.class);
@ -48,7 +44,6 @@ public class SparkGraphImporterJob {
final Encoder<Relation> relationEncoder = Encoders.bean(Relation.class); final Encoder<Relation> relationEncoder = Encoders.bean(Relation.class);
spark.createDataset(oafRdd.filter(s -> s instanceof Organization).map(s -> (Organization) s).rdd(), organizationEncoder).write().save(outputPath + "/organizations"); spark.createDataset(oafRdd.filter(s -> s instanceof Organization).map(s -> (Organization) s).rdd(), organizationEncoder).write().save(outputPath + "/organizations");
spark.createDataset(oafRdd.filter(s -> s instanceof Project).map(s -> (Project) s).rdd(), projectEncoder).write().save(outputPath + "/projects"); spark.createDataset(oafRdd.filter(s -> s instanceof Project).map(s -> (Project) s).rdd(), projectEncoder).write().save(outputPath + "/projects");
spark.createDataset(oafRdd.filter(s -> s instanceof Datasource).map(s -> (Datasource) s).rdd(), datasourceEncoder).write().save(outputPath + "/datasources"); spark.createDataset(oafRdd.filter(s -> s instanceof Datasource).map(s -> (Datasource) s).rdd(), datasourceEncoder).write().save(outputPath + "/datasources");
@ -59,8 +54,5 @@ public class SparkGraphImporterJob {
spark.createDataset(oafRdd.filter(s -> s instanceof OtherResearchProducts).map(s -> (OtherResearchProducts) s).rdd(), otherResearchProductsEncoder).write().save(outputPath + "/otherResearchProducts"); spark.createDataset(oafRdd.filter(s -> s instanceof OtherResearchProducts).map(s -> (OtherResearchProducts) s).rdd(), otherResearchProductsEncoder).write().save(outputPath + "/otherResearchProducts");
spark.createDataset(oafRdd.filter(s -> s instanceof Relation).map(s -> (Relation) s).rdd(), relationEncoder).write().save(outputPath + "/relations"); spark.createDataset(oafRdd.filter(s -> s instanceof Relation).map(s -> (Relation) s).rdd(), relationEncoder).write().save(outputPath + "/relations");
} }
} }

View File

@ -1,5 +1,5 @@
[ [
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"i", "paramLongName":"input", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}, {"paramName":"i", "paramLongName":"input", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputDir", "paramDescription": "the path where store DataFrames on HDFS", "paramRequired": true} {"paramName":"o", "paramLongName":"outputDir", "paramDescription": "the path where store DataFrames on HDFS", "paramRequired": true}
] ]

View File

@ -1,4 +1,4 @@
<workflow-app name="distcp" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>sourcePath</name> <name>sourcePath</name>
@ -8,6 +8,18 @@
<name>targetPath</name> <name>targetPath</name>
<description>the target path</description> <description>the target path</description>
</property> </property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters> </parameters>
<start to="MapGraphIntoDataFrame"/> <start to="MapGraphIntoDataFrame"/>
@ -24,14 +36,14 @@
<mode>cluster</mode> <mode>cluster</mode>
<name>MapGraphIntoDataFrame</name> <name>MapGraphIntoDataFrame</name>
<class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class> <class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class>
<jar>dhp-aggregations-1.0.0-SNAPSHOT.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>--num-executors 50 --conf -spark.extraListeners=com.cloudera.spark.lineage.NavigatorAppListener -spark.sql.queryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener&quot;</spark-opts> <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg> <arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--input</arg><arg>${sourcePath}</arg> <arg>--input</arg><arg>${sourcePath}</arg>
<arg>--outputDir</arg><arg>${targetPath}</arg> <arg>--outputDir</arg><arg>${targetPath}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="kill"/> <error to="Kill"/>
</action> </action>
<end name="End"/> <end name="End"/>

10
pom.xml
View File

@ -155,7 +155,8 @@
<dependency> <dependency>
<groupId>commons-cli</groupId> <groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId> <artifactId>commons-cli</artifactId>
<version>1.4</version> <version>1.2</version>
<scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>
@ -208,6 +209,7 @@
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>javax.persistence</groupId> <groupId>javax.persistence</groupId>
<artifactId>javax.persistence-api</artifactId> <artifactId>javax.persistence-api</artifactId>
@ -244,7 +246,7 @@
<dependency> <dependency>
<groupId>com.googlecode.protobuf-java-format</groupId> <groupId>com.googlecode.protobuf-java-format</groupId>
<artifactId>protobuf-java-format</artifactId> <artifactId>protobuf-java-format</artifactId>
<version>1.2</version> <version>1.4</version>
</dependency> </dependency>
<dependency> <dependency>
@ -450,8 +452,8 @@
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version> <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version> <dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version> <dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
<dhp.spark.version>2.2.0</dhp.spark.version> <dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dhp.jackson.version>2.6.5</dhp.jackson.version> <dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version> <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<scala.version>2.11.8</scala.version> <scala.version>2.11.8</scala.version>
<google.protobuf.version>2.5.0</google.protobuf.version> <google.protobuf.version>2.5.0</google.protobuf.version>