align dependencies with IIS cluster

This commit is contained in:
Claudio Atzori 2019-10-29 18:10:20 +01:00
parent 5e32a4066a
commit c8bb81cd9a
11 changed files with 49 additions and 52 deletions

View File

@ -24,12 +24,13 @@ public class ArgumentApplicationParser implements Serializable {
}
private void createOptionMap(final OptionsParameter[] configuration) {
Arrays.stream(configuration).map(conf -> Option.builder(conf.getParamName())
.longOpt(conf.getParamLongName())
.required(conf.isParamRequired())
.desc(conf.getParamDescription())
.hasArg() // This option has an argument.
.build()).forEach(options::addOption);
Arrays.stream(configuration).map(conf -> {
final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
o.setLongOpt(conf.getParamLongName());
o.setRequired(conf.isParamRequired());
return o;
}).forEach(options::addOption);
// HelpFormatter formatter = new HelpFormatter();
// formatter.printHelp("myapp", null, options, null, true);
@ -38,7 +39,7 @@ public class ArgumentApplicationParser implements Serializable {
}
/**
 * Parses the given command-line arguments against {@code options} and stores the value of
 * every parsed option in {@code objectMap}, keyed by the option's long name.
 *
 * @param args the raw command-line arguments
 * @throws Exception anything raised by {@code parser.parse} propagates to the caller unchanged
 */
public void parseArgument(final String[] args) throws Exception {
// NOTE(review): the next two lines both declare `parser`; this looks like the removed and the
// added line of a diff hunk rendered without +/- markers (DefaultParser -> BasicParser) — confirm.
CommandLineParser parser = new DefaultParser();
CommandLineParser parser = new BasicParser();
CommandLine cmd = parser.parse(options, args);
// Copy each parsed option into objectMap under its long option name.
Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue()));
}

View File

@ -17,18 +17,7 @@
<dependencies>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>

View File

@ -5,18 +5,19 @@ import eu.dnetlib.data.proto.OafProtos;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
/**
 * Test that reads the {@code hugeRecord.json} classpath resource, merges it into an
 * {@code OafProtos.Oaf} builder and prints the result to standard output.
 *
 * NOTE(review): the body of {@code testParse} appears to interleave the removed and the added
 * lines of a diff hunk (static {@code JsonFormat} calls vs. calls on a {@code JsonFormat}
 * instance) — confirm which statements belong to the current revision.
 */
public class TestParseProtoJson {
@Test
public void testParse() throws Exception {
// Load the JSON fixture from the classpath as a single string.
final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/schema/proto/hugeRecord.json"));
// NOTE(review): `oafBuilder` is declared on both of the following builder lines — presumably
// the pre- and post-change variants of the same statement.
OafProtos.Oaf.Builder oafBuilder =OafProtos.Oaf.newBuilder();
JsonFormat.merge(json,oafBuilder);
final OafProtos.Oaf.Builder oafBuilder = OafProtos.Oaf.newBuilder();
System.out.println(JsonFormat.printToString(oafBuilder.build()));
// Instance-based variant: merge the JSON from a stream, build the message, then print it.
JsonFormat jf = new JsonFormat();
jf.merge(IOUtils.toInputStream(json), oafBuilder);
OafProtos.Oaf oaf = oafBuilder.build();
System.out.println(jf.printToString(oaf));
}
}

View File

@ -74,7 +74,7 @@ public class GenerateNativeStoreSparkJob {
final Map<String, String> ongoingMap = new HashMap<>();
final Map<String, String> reportMap = new HashMap<>();
final boolean test = parser.get("isTest") == null?false: Boolean.valueOf(parser.get("isTest"));
final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -86,7 +86,7 @@ public class GenerateNativeStoreSparkJob {
final MessageManager manager = new MessageManager(parser.get("rabbitHost"), parser.get("rabbitUser"), parser.get("rabbitPassword"), false, false, null);
final JavaRDD<MetadataRecord> mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), parser.get("xpath"), parser.get("encoding"),provenance, dateOfCollection, totalItems, invalidRecords))
final JavaRDD<MetadataRecord> mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), parser.get("xpath"), parser.get("encoding"), provenance, dateOfCollection, totalItems, invalidRecords))
.filter(Objects::nonNull).distinct();
ongoingMap.put("ongoing", "0");

View File

@ -17,18 +17,15 @@ public class CollectionJobTest {
/**
 * Creates a fresh temporary directory with the {@code dhp-collection} prefix and keeps a
 * reference to it in {@code testDir}.
 *
 * @throws IOException propagated from {@code Files.createTempDirectory}
 */
@Before
public void setup() throws IOException {
    testDir = Files.createTempDirectory("dhp-collection");
}
/**
 * Removes the temporary directory referenced by {@code testDir}.
 *
 * @throws IOException propagated from {@code FileUtils.deleteDirectory}
 */
@After
public void teadDown() throws IOException {
    final java.io.File tempDirectory = testDir.toFile();
    FileUtils.deleteDirectory(tempDirectory);
}
@Test
public void tesCollection () throws Exception {
public void tesCollection() throws Exception {
Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
GenerateNativeStoreSparkJob.main(new String[] {
"-mt", "local",

View File

@ -16,8 +16,6 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>

View File

@ -5,14 +5,19 @@ import eu.dnetlib.data.proto.FieldTypeProtos;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.data.proto.ResultProtos;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import java.io.IOException;
import java.util.stream.Collectors;
public class ProtoUtils {
/**
 * Builds an {@code OafProtos.Oaf} message from its JSON representation by merging the JSON
 * into a new builder and returning the built message.
 *
 * NOTE(review): two signatures and two merge calls appear below; this looks like the removed
 * and the added lines of a diff hunk shown together (static {@code JsonFormat.merge} throwing
 * {@code JsonFormat.ParseException} vs. instance {@code merge} throwing {@code IOException})
 * — confirm which variant is current.
 *
 * @param json the JSON text to parse
 * @return the message produced by {@code builder.build()}
 */
public static OafProtos.Oaf parse(String json) throws JsonFormat.ParseException {
public static OafProtos.Oaf parse(String json) throws IOException {
final OafProtos.Oaf.Builder builder = OafProtos.Oaf.newBuilder();
JsonFormat.merge(json, builder);
final JsonFormat jf = new JsonFormat();
// NOTE(review): toInputStream(String) is called without a charset, so it presumably encodes
// with the platform default — confirm, and consider passing an explicit charset.
jf.merge(IOUtils.toInputStream(json), builder);
return builder.build();
}

View File

@ -1,6 +1,5 @@
package eu.dnetlib.dhp.graph;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
@ -15,10 +14,8 @@ import scala.Tuple2;
public class SparkGraphImporterJob {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
@ -34,7 +31,6 @@ public class SparkGraphImporterJob {
final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath, Text.class, Text.class)
.map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));
final JavaRDD<Oaf> oafRdd = inputRDD.filter(s -> !StringUtils.isBlank(s._2()) && !s._1().contains("@update")).map(Tuple2::_2).map(ProtoConverter::convert);
final Encoder<Organization> organizationEncoder = Encoders.bean(Organization.class);
@ -48,7 +44,6 @@ public class SparkGraphImporterJob {
final Encoder<Relation> relationEncoder = Encoders.bean(Relation.class);
spark.createDataset(oafRdd.filter(s -> s instanceof Organization).map(s -> (Organization) s).rdd(), organizationEncoder).write().save(outputPath + "/organizations");
spark.createDataset(oafRdd.filter(s -> s instanceof Project).map(s -> (Project) s).rdd(), projectEncoder).write().save(outputPath + "/projects");
spark.createDataset(oafRdd.filter(s -> s instanceof Datasource).map(s -> (Datasource) s).rdd(), datasourceEncoder).write().save(outputPath + "/datasources");
@ -59,8 +54,5 @@ public class SparkGraphImporterJob {
spark.createDataset(oafRdd.filter(s -> s instanceof OtherResearchProducts).map(s -> (OtherResearchProducts) s).rdd(), otherResearchProductsEncoder).write().save(outputPath + "/otherResearchProducts");
spark.createDataset(oafRdd.filter(s -> s instanceof Relation).map(s -> (Relation) s).rdd(), relationEncoder).write().save(outputPath + "/relations");
}
}

View File

@ -1,4 +1,4 @@
<workflow-app name="distcp" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
@ -8,6 +8,18 @@
<name>targetPath</name>
<description>the target path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="MapGraphIntoDataFrame"/>
@ -24,14 +36,14 @@
<mode>cluster</mode>
<name>MapGraphIntoDataFrame</name>
<class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class>
<jar>dhp-aggregations-1.0.0-SNAPSHOT.jar</jar>
<spark-opts>--num-executors 50 --conf -spark.extraListeners=com.cloudera.spark.lineage.NavigatorAppListener -spark.sql.queryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener&quot;</spark-opts>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--input</arg><arg>${sourcePath}</arg>
<arg>--outputDir</arg><arg>${targetPath}</arg>
</spark>
<ok to="End"/>
<error to="kill"/>
<error to="Kill"/>
</action>
<end name="End"/>

10
pom.xml
View File

@ -155,7 +155,8 @@
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.4</version>
<version>1.2</version>
<scope>provided</scope>
</dependency>
<dependency>
@ -208,6 +209,7 @@
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.persistence</groupId>
<artifactId>javax.persistence-api</artifactId>
@ -244,7 +246,7 @@
<dependency>
<groupId>com.googlecode.protobuf-java-format</groupId>
<artifactId>protobuf-java-format</artifactId>
<version>1.2</version>
<version>1.4</version>
</dependency>
<dependency>
@ -450,8 +452,8 @@
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
<dhp.spark.version>2.2.0</dhp.spark.version>
<dhp.jackson.version>2.6.5</dhp.jackson.version>
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<scala.version>2.11.8</scala.version>
<google.protobuf.version>2.5.0</google.protobuf.version>