forked from D-Net/dnet-hadoop
country propagation for results collected from institutional repositories
This commit is contained in:
parent 4955be0197
commit c7bc73aedf
@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
        <version>1.0.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>dhp-bulktag</artifactId>

</project>
@ -0,0 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
        <version>1.0.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>dhp-propagation</artifactId>
    <dependencies>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
        </dependency>

        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-schemas</artifactId>
            <version>${project.version}</version>
        </dependency>

    </dependencies>

</project>
@ -0,0 +1,308 @@
package eu.dnetlib.dhp.countrypropagation;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*;
import scala.Tuple2;

import java.util.*;

public class SparkCountryPropagationJob {
    public static void main(String[] args) throws Exception {

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCountryPropagationJob.class.getResourceAsStream("/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
                .appName(SparkCountryPropagationJob.class.getSimpleName())
                .master(parser.get("master"))
                .enableHiveSupport()
                .getOrCreate();

        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        final String inputPath = parser.get("sourcePath");
        final String outputPath = parser.get("outputPath");

        // datasource white list and allowed datasource types; both are left empty in this commit
        List<String> whitelist = new ArrayList<>();
        List<String> allowedtypes = new ArrayList<>();
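        // A minimal sketch of how these could be populated (hypothetical parameter
        // names, not defined in this commit's parameter file): pass them as
        // semicolon-separated job arguments and split them here, e.g.
        //
        //   whitelist = Arrays.asList(parser.get("whitelist").split(";"));
        //   allowedtypes = Arrays.asList(parser.get("allowedtypes").split(";"));
        //
        // As written, both lists are empty, so the datasource filter below selects
        // no datasource until they are filled in.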

//        JavaPairRDD<String, TypedRow> results = getResults(sc, inputPath);
//        sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class)
//                .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class))
//                .map(oaf -> new TypedRow().setType("dataset").setDeleted(oaf.getDataInfo().getDeletedbyinference()).setOaf(oaf.toString()).setSourceId(oaf.getId()))
//                .mapToPair(toPair())
//                .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)
//                        .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class))
//                        .map(oaf -> new TypedRow().setType("otherresearchproduct").setDeleted(oaf.getDataInfo().getDeletedbyinference()).setOaf(oaf.toString()).setSourceId(oaf.getId()))
//                        .mapToPair(toPair()))
//                .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class)
//                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class))
//                        .map(oaf -> new TypedRow().setType("software").setDeleted(oaf.getDataInfo().getDeletedbyinference()).setOaf(oaf.toString()).setSourceId(oaf.getId()))
//                        .mapToPair(toPair()))
//                .union(sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
//                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
//                        .map(oaf -> new TypedRow().setType("publication").setDeleted(oaf.getDataInfo().getDeletedbyinference()).setOaf(oaf.toString()).setSourceId(oaf.getId()))
//                        .mapToPair(toPair()));
//
//
        // (organizationId -> TypedRow{sourceId = orgId, country}) for organizations still in use
        JavaPairRDD<String, TypedRow> organizations = sc.sequenceFile(inputPath + "/organization", Text.class, Text.class)
                .map(item -> new ObjectMapper().readValue(item._2().toString(), Organization.class))
                .filter(org -> !org.getDataInfo().getDeletedbyinference())
                .map(org -> new TypedRow().setSourceId(org.getId()).setCountry(org.getCountry().getClassid()))
                .mapToPair(toPair());

        // keep only the datasource-organization provenance relations
        JavaPairRDD<String, TypedRow> organization_datasource = sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
                .map(item -> new ObjectMapper().readValue(item._2().toString(), Relation.class))
                .filter(r -> !r.getDataInfo().getDeletedbyinference())
                .filter(r -> "datasourceOrganization".equals(r.getRelClass()) && "isProvidedBy".equals(r.getRelType()))
                .map(r -> new TypedRow().setSourceId(r.getSource()).setTargetId(r.getTarget()))
                .mapToPair(toPair());

        // datasources admitted for propagation: either white-listed or of an allowed type
        JavaPairRDD<String, TypedRow> datasources = sc.sequenceFile(inputPath + "/datasource", Text.class, Text.class)
                .map(item -> new ObjectMapper().readValue(item._2().toString(), Datasource.class))
                .filter(ds -> whitelist.contains(ds.getId()) || allowedtypes.contains(ds.getDatasourcetype().getClassid()))
                .map(ds -> new TypedRow().setSourceId(ds.getId()))
                .mapToPair(toPair());
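        // Shapes at this point, as assumed by the joins below:
        //   organizations:           organizationId -> TypedRow{sourceId = orgId, country}
        //   organization_datasource: organizationId -> TypedRow{sourceId = orgId, targetId = datasourceId}
        //   datasources:             datasourceId   -> TypedRow{sourceId = datasourceId}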

        JavaRDD<Publication> publications = sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
                .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class));
        JavaRDD<Dataset> datasets = sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class)
                .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class));
        JavaRDD<Software> software = sc.sequenceFile(inputPath + "/software", Text.class, Text.class)
                .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class));
        JavaRDD<OtherResearchProduct> other = sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)
                .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class));

        JavaPairRDD<String, TypedRow> datasource_results = publications
                .map(oaf -> getTypedRows(oaf))
                .flatMapToPair(f -> {
                    ArrayList<Tuple2<String, TypedRow>> ret = new ArrayList<>();
                    for (TypedRow t : f) {
                        ret.add(new Tuple2<>(t.getSourceId(), t));
                    }
                    return ret.iterator();
                })
                .union(datasets
                        .map(oaf -> getTypedRows(oaf))
                        .flatMapToPair(f -> {
                            ArrayList<Tuple2<String, TypedRow>> ret = new ArrayList<>();
                            for (TypedRow t : f) {
                                ret.add(new Tuple2<>(t.getSourceId(), t));
                            }
                            return ret.iterator();
                        }))
                .union(software
                        .map(oaf -> getTypedRows(oaf))
                        .flatMapToPair(f -> {
                            ArrayList<Tuple2<String, TypedRow>> ret = new ArrayList<>();
                            for (TypedRow t : f) {
                                ret.add(new Tuple2<>(t.getSourceId(), t));
                            }
                            return ret.iterator();
                        }))
                .union(other
                        .map(oaf -> getTypedRows(oaf))
                        .flatMapToPair(f -> {
                            ArrayList<Tuple2<String, TypedRow>> ret = new ArrayList<>();
                            for (TypedRow t : f) {
                                ret.add(new Tuple2<>(t.getSourceId(), t));
                            }
                            return ret.iterator();
                        }));
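        // datasource_results pairs every result with each datasource it was
        // collected from or hosted by:
        //   datasourceId -> TypedRow{sourceId = datasourceId, targetId = resultId, type}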

        JavaPairRDD<String, OafEntity> pubs = publications.mapToPair(p -> new Tuple2<>(p.getId(), p));
        JavaPairRDD<String, OafEntity> dss = datasets.mapToPair(p -> new Tuple2<>(p.getId(), p));
        JavaPairRDD<String, OafEntity> sfw = software.mapToPair(p -> new Tuple2<>(p.getId(), p));
        JavaPairRDD<String, OafEntity> orp = other.mapToPair(p -> new Tuple2<>(p.getId(), p));

        JavaPairRDD<String, TypedRow> datasource_country = organizations.join(organization_datasource)
                .map(x -> x._2()._1().setSourceId(x._2()._2().getTargetId())) // (organizationId, (TypedRow for Organization, TypedRow for Relation))
                .mapToPair(toPair()); // (datasourceId, TypedRow for Organization)

        JavaPairRDD<String, TypedRow> alloweddatasources_country = datasources.join(datasource_country)
                .mapToPair(ds -> new Tuple2<>(ds._1(), ds._2()._2()));

        // key the updates by the *result* id (targetId), so they can be joined
        // with the result entities inside updateResult below
        JavaPairRDD<String, TypedRow> toupdateresult = alloweddatasources_country.join(datasource_results)
                .map(u -> u._2()._2().setCountry(u._2()._1().getCountry()))
                .mapToPair(c -> new Tuple2<>(c.getTargetId(), c))
                .reduceByKey((a, p) -> {
                    if (a == null) {
                        return p;
                    }
                    if (p == null) {
                        return a;
                    }
                    // merge the two semicolon-separated country lists, dropping duplicates
                    HashSet<String> countries = new HashSet<>();
                    countries.addAll(Arrays.asList(a.getCountry().split(";")));
                    countries.addAll(Arrays.asList(p.getCountry().split(";")));
                    return a.setCountry(String.join(";", countries));
                });
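        // Worked example of the reduce above (illustrative values): for a result
        // reachable from two admitted datasources carrying countries "IT;FR" and
        // "FR;DE", the merged row carries country = "IT;FR;DE" (order not significant).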

        updateResult(pubs, toupdateresult, outputPath, "publication");
        updateResult(dss, toupdateresult, outputPath, "dataset");
        updateResult(sfw, toupdateresult, outputPath, "software");
        updateResult(orp, toupdateresult, outputPath, "otherresearchproduct");
        // we use leftOuterJoin because we want to rebuild the entire structure

    }

    private static void updateResult(JavaPairRDD<String, OafEntity> results, JavaPairRDD<String, TypedRow> toupdateresult, String outputPath, String type) {
        results.leftOuterJoin(toupdateresult)
                .map(c -> {
                    OafEntity oaf = c._2()._1();
                    List<Qualifier> qualifierList = null;
                    if (oaf.getClass() == Publication.class) {
                        qualifierList = ((Publication) oaf).getCountry();
                    }
                    if (oaf.getClass() == Dataset.class) {
                        qualifierList = ((Dataset) oaf).getCountry();
                    }
                    if (oaf.getClass() == Software.class) {
                        qualifierList = ((Software) oaf).getCountry();
                    }
                    if (oaf.getClass() == OtherResearchProduct.class) {
                        qualifierList = ((OtherResearchProduct) oaf).getCountry();
                    }
                    if (qualifierList == null) {
                        // the result may carry no country list yet
                        qualifierList = new ArrayList<>();
                    }

                    if (c._2()._2().isPresent()) {
                        HashSet<String> countries = new HashSet<>();
                        for (Qualifier country : qualifierList) {
                            countries.add(country.getClassid());
                        }
                        TypedRow t = c._2()._2().get();

                        // add only the propagated countries not already present
                        for (String country : t.getCountry().split(";")) {
                            if (!countries.contains(country)) {
                                Qualifier q = new Qualifier();
                                q.setClassid(country);
                                qualifierList.add(q);
                            }
                        }
                        if (oaf.getClass() == Publication.class) {
                            ((Publication) oaf).setCountry(qualifierList);
                            return (Publication) oaf;
                        }
                        if (oaf.getClass() == Dataset.class) {
                            ((Dataset) oaf).setCountry(qualifierList);
                            return (Dataset) oaf;
                        }
                        if (oaf.getClass() == Software.class) {
                            ((Software) oaf).setCountry(qualifierList);
                            return (Software) oaf;
                        }
                        if (oaf.getClass() == OtherResearchProduct.class) {
                            ((OtherResearchProduct) oaf).setCountry(qualifierList);
                            return (OtherResearchProduct) oaf;
                        }
                    }

                    // no update for this result: emit the entity unchanged, so the
                    // entire structure is rebuilt (this is why leftOuterJoin is used)
                    return oaf;
                })
                .map(p -> new ObjectMapper().writeValueAsString(p))
                .saveAsTextFile(outputPath + "/" + type);
    }

    private static List<TypedRow> getTypedRows(OafEntity oaf) {
        List<TypedRow> lst = new ArrayList<>();
        Set<String> datasources_provenance = new HashSet<>();
        List<Instance> instanceList = null;
        String type = "";
        if (oaf.getClass() == Publication.class) {
            instanceList = ((Publication) oaf).getInstance();
            type = "publication";
        }
        if (oaf.getClass() == Dataset.class) {
            instanceList = ((Dataset) oaf).getInstance();
            type = "dataset";
        }
        if (oaf.getClass() == Software.class) {
            instanceList = ((Software) oaf).getInstance();
            type = "software";
        }
        if (oaf.getClass() == OtherResearchProduct.class) {
            instanceList = ((OtherResearchProduct) oaf).getInstance();
            type = "otherresearchproduct";
        }

        if (instanceList != null) {
            // a result is associated to every datasource it was collected from or is hosted by
            for (Instance i : instanceList) {
                datasources_provenance.add(i.getCollectedfrom().getKey());
                datasources_provenance.add(i.getHostedby().getKey());
            }
        }
        for (String dsId : datasources_provenance) {
            lst.add(new TypedRow().setSourceId(dsId).setTargetId(oaf.getId()).setType(type));
        }
        return lst;
    }
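    // Example (illustrative ids): a publication "pub1" collected from datasource
    // "ds1" and hosted by "ds2" yields two TypedRows,
    // {sourceId=ds1, targetId=pub1, type=publication} and
    // {sourceId=ds2, targetId=pub1, type=publication}.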

    private static JavaPairRDD<String, TypedRow> getResults(JavaSparkContext sc, String inputPath) {

        return
                sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class)
                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class))
                        .filter(ds -> !ds.getDataInfo().getDeletedbyinference())
                        .map(oaf -> new TypedRow().setType("dataset").setSourceId(oaf.getId()))
                        .mapToPair(toPair())
                        .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)
                                .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class))
                                .filter(o -> !o.getDataInfo().getDeletedbyinference())
                                .map(oaf -> new TypedRow().setType("otherresearchproduct").setSourceId(oaf.getId()))
                                .mapToPair(toPair()))
                        .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class)
                                .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class))
                                .filter(s -> !s.getDataInfo().getDeletedbyinference())
                                .map(oaf -> new TypedRow().setType("software").setSourceId(oaf.getId()))
                                .mapToPair(toPair()))
                        .union(sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
                                .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
                                .filter(p -> !p.getDataInfo().getDeletedbyinference())
                                .map(oaf -> new TypedRow().setType("publication").setSourceId(oaf.getId()))
                                .mapToPair(toPair()));
    }

    private static PairFunction<TypedRow, String, TypedRow> toPair() {
        return e -> new Tuple2<>(e.getSourceId(), e);
    }

}
@ -0,0 +1,70 @@
package eu.dnetlib.dhp.countrypropagation;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class TypedRow implements Serializable {
    private String sourceId;
    private String targetId;
    private String type;
    private String country;
    private List<String> accumulator;

    public List<String> getAccumulator() {
        return accumulator;
    }

    public TypedRow setAccumulator(List<String> accumulator) {
        this.accumulator = accumulator;
        return this;
    }

    public void add(String a) {
        if (accumulator == null) {
            accumulator = new ArrayList<>();
        }
        accumulator.add(a);
    }

    public Iterator<String> getAccumulatorIterator() {
        return accumulator.iterator();
    }

    public String getCountry() {
        return country;
    }

    public TypedRow setCountry(String country) {
        this.country = country;
        return this;
    }

    public String getSourceId() {
        return sourceId;
    }

    public TypedRow setSourceId(String sourceId) {
        this.sourceId = sourceId;
        return this;
    }

    public String getTargetId() {
        return targetId;
    }

    public TypedRow setTargetId(String targetId) {
        this.targetId = targetId;
        return this;
    }

    public String getType() {
        return type;
    }

    public TypedRow setType(String type) {
        this.type = type;
        return this;
    }

}
@ -0,0 +1,3 @@
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
sparkDriverMemory=15G
sparkExecutorMemory=15G
@ -0,0 +1,5 @@
[
  {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequential file to read", "paramRequired": true},
  {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path of the sequential file to write", "paramRequired": true}
]
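A minimal local invocation matching these three parameters might look like the following (jar name and paths are hypothetical; the module artifactId suggests a dhp-propagation jar):

    spark-submit --class eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob \
        dhp-propagation-1.0.5-SNAPSHOT.jar \
        -mt local[*] --sourcePath /tmp/graph_dump --outputPath /tmp/country_propagation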
@ -0,0 +1,18 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
</configuration>
@ -0,0 +1,55 @@
<workflow-app name="country_propagation" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the output path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
    </parameters>

    <start to="CountryPropagation"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="CountryPropagation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>CountryPropagation</name>
            <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
            <jar>dhp-propagation-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory ${sparkExecutorMemory}
                --executor-cores ${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
@ -18,6 +18,8 @@
        <module>dhp-distcp</module>
        <module>dhp-graph-mapper</module>
        <module>dhp-dedup</module>
        <module>dhp-bulktag</module>
        <module>dhp-propagation</module>
    </modules>

    <pluginRepositories>