forked from D-Net/dnet-hadoop
align dependencies with IIS cluster
This commit is contained in:
parent
5e32a4066a
commit
c8bb81cd9a
|
@ -24,12 +24,13 @@ public class ArgumentApplicationParser implements Serializable {
|
|||
}
|
||||
|
||||
private void createOptionMap(final OptionsParameter[] configuration) {
|
||||
Arrays.stream(configuration).map(conf -> Option.builder(conf.getParamName())
|
||||
.longOpt(conf.getParamLongName())
|
||||
.required(conf.isParamRequired())
|
||||
.desc(conf.getParamDescription())
|
||||
.hasArg() // This option has an argument.
|
||||
.build()).forEach(options::addOption);
|
||||
|
||||
Arrays.stream(configuration).map(conf -> {
|
||||
final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
|
||||
o.setLongOpt(conf.getParamLongName());
|
||||
o.setRequired(conf.isParamRequired());
|
||||
return o;
|
||||
}).forEach(options::addOption);
|
||||
|
||||
// HelpFormatter formatter = new HelpFormatter();
|
||||
// formatter.printHelp("myapp", null, options, null, true);
|
||||
|
@ -38,7 +39,7 @@ public class ArgumentApplicationParser implements Serializable {
|
|||
}
|
||||
|
||||
public void parseArgument(final String[] args) throws Exception {
|
||||
CommandLineParser parser = new DefaultParser();
|
||||
CommandLineParser parser = new BasicParser();
|
||||
CommandLine cmd = parser.parse(options, args);
|
||||
Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue()));
|
||||
}
|
||||
|
|
|
@ -17,18 +17,7 @@
|
|||
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-annotations</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
|
|
|
@ -5,18 +5,19 @@ import eu.dnetlib.data.proto.OafProtos;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Test;
|
||||
|
||||
|
||||
public class TestParseProtoJson {
|
||||
|
||||
|
||||
@Test
|
||||
public void testParse() throws Exception {
|
||||
final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/schema/proto/hugeRecord.json"));
|
||||
|
||||
OafProtos.Oaf.Builder oafBuilder =OafProtos.Oaf.newBuilder();
|
||||
JsonFormat.merge(json,oafBuilder);
|
||||
final OafProtos.Oaf.Builder oafBuilder = OafProtos.Oaf.newBuilder();
|
||||
|
||||
System.out.println(JsonFormat.printToString(oafBuilder.build()));
|
||||
JsonFormat jf = new JsonFormat();
|
||||
jf.merge(IOUtils.toInputStream(json), oafBuilder);
|
||||
|
||||
OafProtos.Oaf oaf = oafBuilder.build();
|
||||
System.out.println(jf.printToString(oaf));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -74,7 +74,7 @@ public class GenerateNativeStoreSparkJob {
|
|||
final Map<String, String> ongoingMap = new HashMap<>();
|
||||
final Map<String, String> reportMap = new HashMap<>();
|
||||
|
||||
final boolean test = parser.get("isTest") == null?false: Boolean.valueOf(parser.get("isTest"));
|
||||
final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
|
@ -86,7 +86,7 @@ public class GenerateNativeStoreSparkJob {
|
|||
|
||||
final MessageManager manager = new MessageManager(parser.get("rabbitHost"), parser.get("rabbitUser"), parser.get("rabbitPassword"), false, false, null);
|
||||
|
||||
final JavaRDD<MetadataRecord> mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), parser.get("xpath"), parser.get("encoding"),provenance, dateOfCollection, totalItems, invalidRecords))
|
||||
final JavaRDD<MetadataRecord> mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), parser.get("xpath"), parser.get("encoding"), provenance, dateOfCollection, totalItems, invalidRecords))
|
||||
.filter(Objects::nonNull).distinct();
|
||||
|
||||
ongoingMap.put("ongoing", "0");
|
||||
|
|
|
@ -17,18 +17,15 @@ public class CollectionJobTest {
|
|||
@Before
|
||||
public void setup() throws IOException {
|
||||
testDir = Files.createTempDirectory("dhp-collection");
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@After
|
||||
public void teadDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testDir.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tesCollection () throws Exception {
|
||||
public void tesCollection() throws Exception {
|
||||
Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
|
||||
GenerateNativeStoreSparkJob.main(new String[] {
|
||||
"-mt", "local",
|
||||
|
|
|
@ -16,8 +16,6 @@
|
|||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
|
|
|
@ -5,14 +5,19 @@ import eu.dnetlib.data.proto.FieldTypeProtos;
|
|||
import eu.dnetlib.data.proto.OafProtos;
|
||||
import eu.dnetlib.data.proto.ResultProtos;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class ProtoUtils {
|
||||
|
||||
public static OafProtos.Oaf parse(String json) throws JsonFormat.ParseException {
|
||||
public static OafProtos.Oaf parse(String json) throws IOException {
|
||||
final OafProtos.Oaf.Builder builder = OafProtos.Oaf.newBuilder();
|
||||
JsonFormat.merge(json, builder);
|
||||
|
||||
final JsonFormat jf = new JsonFormat();
|
||||
jf.merge(IOUtils.toInputStream(json), builder);
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -15,10 +14,8 @@ import scala.Tuple2;
|
|||
|
||||
public class SparkGraphImporterJob {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
|
@ -34,7 +31,6 @@ public class SparkGraphImporterJob {
|
|||
final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath, Text.class, Text.class)
|
||||
.map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));
|
||||
|
||||
|
||||
final JavaRDD<Oaf> oafRdd = inputRDD.filter(s -> !StringUtils.isBlank(s._2()) && !s._1().contains("@update")).map(Tuple2::_2).map(ProtoConverter::convert);
|
||||
|
||||
final Encoder<Organization> organizationEncoder = Encoders.bean(Organization.class);
|
||||
|
@ -48,7 +44,6 @@ public class SparkGraphImporterJob {
|
|||
|
||||
final Encoder<Relation> relationEncoder = Encoders.bean(Relation.class);
|
||||
|
||||
|
||||
spark.createDataset(oafRdd.filter(s -> s instanceof Organization).map(s -> (Organization) s).rdd(), organizationEncoder).write().save(outputPath + "/organizations");
|
||||
spark.createDataset(oafRdd.filter(s -> s instanceof Project).map(s -> (Project) s).rdd(), projectEncoder).write().save(outputPath + "/projects");
|
||||
spark.createDataset(oafRdd.filter(s -> s instanceof Datasource).map(s -> (Datasource) s).rdd(), datasourceEncoder).write().save(outputPath + "/datasources");
|
||||
|
@ -59,8 +54,5 @@ public class SparkGraphImporterJob {
|
|||
spark.createDataset(oafRdd.filter(s -> s instanceof OtherResearchProducts).map(s -> (OtherResearchProducts) s).rdd(), otherResearchProductsEncoder).write().save(outputPath + "/otherResearchProducts");
|
||||
|
||||
spark.createDataset(oafRdd.filter(s -> s instanceof Relation).map(s -> (Relation) s).rdd(), relationEncoder).write().save(outputPath + "/relations");
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"input", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputDir", "paramDescription": "the path where store DataFrames on HDFS", "paramRequired": true}
|
||||
{"paramName":"o", "paramLongName":"outputDir", "paramDescription": "the path where store DataFrames on HDFS", "paramRequired": true}
|
||||
]
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="distcp" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
|
@ -8,6 +8,18 @@
|
|||
<name>targetPath</name>
|
||||
<description>the target path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="MapGraphIntoDataFrame"/>
|
||||
|
@ -24,14 +36,14 @@
|
|||
<mode>cluster</mode>
|
||||
<name>MapGraphIntoDataFrame</name>
|
||||
<class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class>
|
||||
<jar>dhp-aggregations-1.0.0-SNAPSHOT.jar</jar>
|
||||
<spark-opts>--num-executors 50 --conf -spark.extraListeners=com.cloudera.spark.lineage.NavigatorAppListener -spark.sql.queryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener"</spark-opts>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory}</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--input</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputDir</arg><arg>${targetPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
|
10
pom.xml
10
pom.xml
|
@ -155,7 +155,8 @@
|
|||
<dependency>
|
||||
<groupId>commons-cli</groupId>
|
||||
<artifactId>commons-cli</artifactId>
|
||||
<version>1.4</version>
|
||||
<version>1.2</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -208,6 +209,7 @@
|
|||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>javax.persistence</groupId>
|
||||
<artifactId>javax.persistence-api</artifactId>
|
||||
|
@ -244,7 +246,7 @@
|
|||
<dependency>
|
||||
<groupId>com.googlecode.protobuf-java-format</groupId>
|
||||
<artifactId>protobuf-java-format</artifactId>
|
||||
<version>1.2</version>
|
||||
<version>1.4</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -450,8 +452,8 @@
|
|||
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
|
||||
<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
|
||||
<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
|
||||
<dhp.spark.version>2.2.0</dhp.spark.version>
|
||||
<dhp.jackson.version>2.6.5</dhp.jackson.version>
|
||||
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
|
||||
<dhp.jackson.version>2.9.6</dhp.jackson.version>
|
||||
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
|
||||
<scala.version>2.11.8</scala.version>
|
||||
<google.protobuf.version>2.5.0</google.protobuf.version>
|
||||
|
|
Loading…
Reference in New Issue