WIP: subjectPropagation #269
3 binary files changed (not shown).
@@ -12,14 +12,13 @@ import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.Country;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
 
 public class PropagationConstant {
 
@@ -237,4 +236,30 @@ public class PropagationConstant {
 			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
 	}
 
+	public static <R extends Oaf> Dataset<R> readOafKryoPath(
+		SparkSession spark, String inputPath, Class<R> clazz) {
+		return spark
+			.read()
+			.textFile(inputPath)
+			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.kryo(clazz));
+	}
+
+	public static Class[] getModelClasses() {
+		List<Class<?>> modelClasses = Lists.newArrayList(ModelSupport.getOafModelClasses());
+		modelClasses
+			.addAll(
+				Lists
+					.newArrayList(
+						Result.class,
+						Qualifier.class,
+						DataInfo.class,
+						Publication.class,
+						eu.dnetlib.dhp.schema.oaf.Dataset.class,
+						Software.class,
+						OtherResearchProduct.class,
+						Subject.class,
+						AccessRight.class));
+		return modelClasses.toArray(new Class[] {});
+	}
 }
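The two helpers added above work as a pair: `getModelClasses()` enumerates the OAF model classes so the Kryo serializer can be primed with them, and `readOafKryoPath` deserializes JSON-encoded entities into a Kryo-encoded `Dataset`. A minimal usage sketch (session setup, the path, and the class name `ReadOafKryoExample` are hypothetical):

```java
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.schema.oaf.Publication;

public class ReadOafKryoExample {
	public static void main(String[] args) {
		SparkConf conf = new SparkConf();
		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
		conf.registerKryoClasses(PropagationConstant.getModelClasses());

		SparkSession spark = SparkSession
			.builder()
			.appName("read-oaf-kryo-example")
			.master("local[*]")
			.config(conf)
			.getOrCreate();

		// Kryo encoding preserves the full polymorphic OAF object graph, which
		// Encoders.bean() cannot represent; the price is a single opaque binary column.
		Dataset<Publication> publications = PropagationConstant
			.readOafKryoPath(spark, "/tmp/graph/publication", Publication.class);
		System.out.println(publications.count());
		spark.stop();
	}
}
```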
@@ -70,7 +70,7 @@ public class PrepareResultResultStep1 implements Serializable {
 
 		final List<String> allowedSemRel = Arrays
 			.asList(
-				parser.get("allowedSemRel").split(";"))
+				parser.get("allowedsemrels").split(";"))
 			.stream()
 			.map(s -> s.toLowerCase())
 			.collect(Collectors.toList());
@@ -98,7 +98,7 @@ public class PrepareResultResultStep1 implements Serializable {
 		Dataset<R> result = readPath(spark, inputPath + "/" + resultType, resultClazz)
 			.filter(
 				(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
-					!r.getDataInfo().getInvisible() &&
+					!r.getDataInfo().getInvisible() && Optional.ofNullable(r.getSubject()).isPresent() &&
 					r
 						.getSubject()
 						.stream()
@@ -116,22 +116,28 @@ public class PrepareResultResultStep1 implements Serializable {
 				(MapGroupsFunction<String, Tuple2<R, Relation>, ResultSubjectList>) (k,
 					it) -> getResultSubjectList(subjectClassList, k, it),
 				Encoders.bean(ResultSubjectList.class))
+			.filter(Objects::nonNull)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(outputPath + "/" + resultType);
 	}
 
-	@NotNull
 	private static <R extends Result> ResultSubjectList getResultSubjectList(List<String> subjectClassList, String k,
 		Iterator<Tuple2<R, Relation>> it) {
+		Tuple2<R, Relation> first = it.next();
+		if (!Optional.ofNullable(first._1()).isPresent()) {
+			return null;
+		}
 		ResultSubjectList rsl = new ResultSubjectList();
 		rsl.setResId(k);
-		Tuple2<R, Relation> first = it.next();
 		List<SubjectInfo> sbjInfo = new ArrayList<>();
 		Set<String> subjectSet = new HashSet<>();
 		extracted(subjectClassList, first._1().getSubject(), sbjInfo, subjectSet);
-		it.forEachRemaining(t2 -> extracted(subjectClassList, t2._1().getSubject(), sbjInfo, subjectSet));
+		it.forEachRemaining(t2 -> {
+			if (Optional.ofNullable(t2._1()).isPresent())
+				extracted(subjectClassList, t2._1().getSubject(), sbjInfo, subjectSet);
+		});
 		rsl.setSubjectList(sbjInfo);
 		return rsl;
 	}
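The reworked `getResultSubjectList` now pulls the first element of each group before allocating anything: after the join feeding the `groupByKey`, a group can be keyed by a target id with no matching result, in which case `first._1()` is null, the method returns null, and the new `.filter(Objects::nonNull)` drops the group before it is written. A self-contained sketch of that pattern under simplified types (the real job groups `Tuple2<R, Relation>` by the relation target id; class name is mine):

```java
import java.util.Arrays;
import java.util.Objects;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class NullGroupFilterExample {
	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

		// "b:" plays the role of a group whose first element carries no payload
		Dataset<String> values = spark
			.createDataset(Arrays.asList("a:1", "b:"), Encoders.STRING());

		Dataset<String> aggregated = values
			.groupByKey((MapFunction<String, String>) v -> v.split(":")[0], Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, String, String>) (key, it) -> {
				String first = it.next();
				if (first.endsWith(":")) {
					return null; // mirror getResultSubjectList's early exit
				}
				return key + "->" + first.split(":")[1];
			}, Encoders.STRING())
			.filter(Objects::nonNull); // as in the PR, discard the empty groups

		aggregated.show(); // only "a->1" survives
		spark.stop();
	}
}
```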
@@ -50,6 +50,7 @@ public class SparkSubjectPropagationStep2 implements Serializable {
 
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 
 		final String resultClassName = parser.get("resultTableName");
 		log.info("resultTableName: {}", resultClassName);
 
@@ -58,14 +59,15 @@ public class SparkSubjectPropagationStep2 implements Serializable {
 		final String resultType = parser.get("resultType");
 		log.info("resultType: {}", resultType);
 
-		final String inputPath = parser.get("inputPath");
+		final String inputPath = parser.get("sourcePath");
 		log.info("inputPath: {}", inputPath);
 
 		final String workingPath = parser.get("workingPath");
 		log.info("workingPath: {}", workingPath);
 
 		SparkConf conf = new SparkConf();
+		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
+		conf.registerKryoClasses(getModelClasses());
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
@@ -83,7 +85,11 @@ public class SparkSubjectPropagationStep2 implements Serializable {
 		Class<R> resultClazz,
 		String resultType) {
 
-		Dataset<R> results = readPath(spark, inputPath + "/" + resultType, resultClazz);
+		Dataset<Tuple2<String, R>> results = readOafKryoPath(spark, inputPath + "/" + resultType, resultClazz)
+			.map(
+				(MapFunction<R, Tuple2<String, R>>) r -> new Tuple2(r.getId(), r),
+				Encoders.tuple(Encoders.STRING(), Encoders.kryo(resultClazz)));
+
 		Dataset<ResultSubjectList> preparedResult = readPath(
 			spark, preparedPath + "/publication", ResultSubjectList.class)
 			.union(readPath(spark, preparedPath + "/dataset", ResultSubjectList.class))
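The switch from `readPath` to `readOafKryoPath` changes the dataset's physical schema: a Kryo-encoded `Dataset` exposes a single binary column, so `results.col("id")` no longer exists for the join that follows. Wrapping each entity in a `Tuple2<String, R>` keyed by its id restores a plain string column, `_1`, which is exactly what the join condition below is rewritten against. A sketch of the pattern, assuming a `Publication` input (class name is mine):

```java
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

import eu.dnetlib.dhp.schema.oaf.Publication;

import scala.Tuple2;

public class KryoJoinKeyExample {
	public static Dataset<Tuple2<String, Publication>> keyById(Dataset<Publication> publications) {
		return publications
			.map(
				(MapFunction<Publication, Tuple2<String, Publication>>) p -> new Tuple2<>(p.getId(), p),
				// _1 is a queryable string column; _2 stays Kryo-encoded binary
				Encoders.tuple(Encoders.STRING(), Encoders.kryo(Publication.class)));
	}
}
```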
@@ -93,20 +99,26 @@ public class SparkSubjectPropagationStep2 implements Serializable {
 		results
 			.joinWith(
 				preparedResult,
-				results.col("id").equalTo(preparedResult.col("resId")),
+				results.col("_1").equalTo(preparedResult.col("resId")),
 				"left")
-			.map((MapFunction<Tuple2<R, ResultSubjectList>, R>) t2 -> {
-				R res = t2._1();
+			.map((MapFunction<Tuple2<Tuple2<String, R>, ResultSubjectList>, String>) t2 -> {
+				R res = t2._1()._2();
+				// extract the subject typologies from the result
+				Map<String, List<String>> resultMap = new HashMap<>();
 				if (Optional.ofNullable(t2._2()).isPresent()) {
-					// extract the subject typologies from the result
-					Map<String, List<String>> resultMap = new HashMap<>();
-					res.getSubject().stream().forEach(s -> {
-						String cid = s.getQualifier().getClassid();
+					if (Optional.ofNullable(res.getSubject()).isPresent()) {
+						res.getSubject().stream().forEach(s -> {
+							String cid = s.getQualifier().getClassid();
+							if (!cid.equals(ModelConstants.DNET_SUBJECT_KEYWORD)) {
 								if (!resultMap.containsKey(cid)) {
 									resultMap.put(cid, new ArrayList<>());
 								}
 								resultMap.get(cid).add(s.getValue());
-					});
+							}
+						});
+					} else {
+						res.setSubject(new ArrayList<>());
+					}
 
 				// Remove from the list all the subjects with the same class already present in the result
 				List<String> distinctClassId = t2
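The new branch above buckets the subjects already carried by the result into a map keyed by qualifier classid, skipping the generic keyword class (`ModelConstants.DNET_SUBJECT_KEYWORD`) so that only controlled subject types such as fos and sdg suppress re-propagation; when the result has no subjects at all, an empty list is set so later code can append safely. The bucketing logic as a plain-Java sketch (class name is mine):

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Subject;

public class SubjectBuckets {
	public static Map<String, List<String>> byClassId(List<Subject> subjects) {
		Map<String, List<String>> resultMap = new HashMap<>();
		subjects.forEach(s -> {
			String cid = s.getQualifier().getClassid();
			// free-text keywords never block propagation, only controlled classes do
			if (!cid.equals(ModelConstants.DNET_SUBJECT_KEYWORD)) {
				resultMap.computeIfAbsent(cid, x -> new ArrayList<>()).add(s.getValue());
			}
		});
		return resultMap;
	}
}
```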
@@ -142,12 +154,12 @@ public class SparkSubjectPropagationStep2 implements Serializable {
 					}
 
 				}
-				return res;
-			}, Encoders.bean(resultClazz))
+				return OBJECT_MAPPER.writeValueAsString(res);
+			}, Encoders.STRING())
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
-			.json(workingPath + "/" + resultType);
+			.text(workingPath + "/" + resultType);
 
 		readPath(spark, workingPath + "/" + resultType, resultClazz)
 			.write()
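Returning `OBJECT_MAPPER.writeValueAsString(res)` with `Encoders.STRING()` and writing via `.text()` sidesteps `Encoders.bean(resultClazz)`, which cannot faithfully represent the polymorphic OAF model once Kryo is in play; each output line is still valid JSON, so the follow-up `readPath` parses the same lines back into typed objects. A sketch of the round trip (path and class name are illustrative):

```java
import com.fasterxml.jackson.databind.ObjectMapper;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;

import eu.dnetlib.dhp.schema.oaf.Publication;

public class WriteAsJsonText {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void write(Dataset<Publication> enriched, String workingPath) {
		enriched
			.map(
				(MapFunction<Publication, String>) p -> OBJECT_MAPPER.writeValueAsString(p),
				Encoders.STRING())
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			// .text() requires a single string column, which Encoders.STRING() guarantees
			.text(workingPath + "/publication");
	}
}
```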
@@ -3,7 +3,7 @@
 
 	{
 		"paramName":"asr",
-		"paramLongName":"allowedSemRel",
+		"paramLongName":"allowedsemrels",
 		"paramDescription": "the set of semantic relations between the results to be exploited to perform the propagation",
 		"paramRequired": true
 	},
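The rename matters because `paramLongName` is the lookup key: it must match both `parser.get(...)` in `PrepareResultResultStep1` and the `-allowedsemrels` switch passed by the workflow. A sketch of how `ArgumentApplicationParser` ties the three together (the resource path is hypothetical, and this assumes the definition file declares only the parameters passed here, since `parseArgument` enforces `paramRequired`):

```java
import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class ParserWiringExample {
	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				ParserWiringExample.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/subjecttoresultfromsemrel/input_preparesubjecttoresult_parameters.json"));
		ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser
			.parseArgument(
				new String[] {
					"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
					"-subjectlist", "fos;sdg"
				});
		// lookups use the long name, not the short "asr" alias
		System.out.println(parser.get("allowedsemrels"));
	}
}
```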
@@ -13,8 +13,8 @@
 		"paramRequired": true
 	},
 	{
-		"paramName":"ip",
-		"paramLongName":"inputPath",
+		"paramName":"sp",
+		"paramLongName":"sourcePath",
 		"paramDescription": "the path of the input graph",
 		"paramRequired": true
 	},
@@ -48,7 +48,7 @@
    </property>
    <property>
        <name>sparkExecutorMemory</name>
-        <value>6G</value>
+        <value>10G</value>
    </property>
    <property>
        <name>sparkExecutorCores</name>
@@ -1,12 +1,4 @@
 <workflow-app name="subject_to_result_propagation" xmlns="uri:oozie:workflow:0.5">
-
-
-    <!-- {-->
-    <!-- "paramName": "out",-->
-    <!-- "paramLongName": "outputPath",-->
-    <!-- "paramDescription": "the path used to store temporary output files",-->
-    <!-- "paramRequired": true-->
-    <!-- }-->
     <parameters>
         <property>
             <name>sourcePath</name>
@@ -16,14 +8,6 @@
             <name>subjectlist</name>
             <description>the list of subject classid to propagate (split by ;)</description>
         </property>
-        <property>
-            <name>resultType</name>
-            <description>the result tapy</description>
-        </property>
-        <property>
-            <name>resultTableName</name>
-            <description>the class of the result</description>
-        </property>
         <property>
             <name>allowedsemrels</name>
             <description>the allowed semantics </description>
@@ -64,14 +48,14 @@
         <path start="prepare_subject_propagation_publication"/>
         <path start="prepare_subject_propagation_dataset"/>
         <path start="prepare_subject_propagation_software"/>
-        <path start="prepare_subject_propagation_otherresearchproduct"/>
+        <path start="prepare_subject_propagation_orp"/>
     </fork>
 
     <action name="prepare_subject_propagation_publication">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>PrepareProjectResultsAssociation</name>
+            <name>PrepareSubjectResultsAssociation</name>
             <class>eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1</class>
             <jar>dhp-enrichment-${projectVersion}.jar</jar>
             <spark-opts>
@@ -98,7 +82,7 @@
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>PrepareProjectResultsAssociation</name>
+            <name>PrepareSubjectResultsAssociation</name>
             <class>eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1</class>
             <jar>dhp-enrichment-${projectVersion}.jar</jar>
             <spark-opts>
@@ -125,7 +109,7 @@
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>PrepareProjectResultsAssociation</name>
+            <name>PrepareSubjectResultsAssociation</name>
             <class>eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1</class>
             <jar>dhp-enrichment-${projectVersion}.jar</jar>
             <spark-opts>
@@ -152,7 +136,7 @@
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>PrepareProjectResultsAssociation</name>
+            <name>PrepareSubjectResultsAssociation</name>
             <class>eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1</class>
             <jar>dhp-enrichment-${projectVersion}.jar</jar>
             <spark-opts>
@@ -188,12 +172,12 @@
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>ProjectToResultPropagation</name>
-            <class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
+            <name>SubjectToResultPropagation</name>
+            <class>eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2</class>
             <jar>dhp-enrichment-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-cores=${sparkExecutorCores}
-                --executor-memory=${sparkExecutorMemory}
+                --executor-memory=8G
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -201,8 +185,9 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.dynamicAllocation.enabled=true
                 --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+                --conf spark.sql.shuffle.partitions=3840
             </spark-opts>
-            <arg>--inputPath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
             <arg>--outputPath</arg><arg>${outputPath}</arg>
             <arg>--workingPath</arg><arg>${workingDir}/working</arg>
             <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@@ -217,8 +202,8 @@
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>ProjectToResultPropagation</name>
-            <class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
+            <name>SubjectToResultPropagation</name>
+            <class>eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2</class>
             <jar>dhp-enrichment-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-cores=${sparkExecutorCores}
@@ -230,8 +215,9 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.dynamicAllocation.enabled=true
                 --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+                --conf spark.sql.shuffle.partitions=3840
             </spark-opts>
-            <arg>--inputPath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
             <arg>--outputPath</arg><arg>${outputPath}</arg>
             <arg>--workingPath</arg><arg>${workingDir}/working</arg>
             <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
@@ -246,8 +232,8 @@
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>ProjectToResultPropagation</name>
-            <class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
+            <name>SubjectToResultPropagation</name>
+            <class>eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2</class>
             <jar>dhp-enrichment-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-cores=${sparkExecutorCores}
@@ -259,8 +245,9 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.dynamicAllocation.enabled=true
                 --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+                --conf spark.sql.shuffle.partitions=3840
             </spark-opts>
-            <arg>--inputPath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
             <arg>--outputPath</arg><arg>${outputPath}</arg>
             <arg>--workingPath</arg><arg>${workingDir}/working</arg>
             <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
@@ -275,8 +262,8 @@
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>ProjectToResultPropagation</name>
-            <class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
+            <name>SubjectToResultPropagation</name>
+            <class>eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2</class>
             <jar>dhp-enrichment-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-cores=${sparkExecutorCores}
@@ -288,8 +275,9 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.dynamicAllocation.enabled=true
                 --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+                --conf spark.sql.shuffle.partitions=3840
             </spark-opts>
-            <arg>--inputPath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
             <arg>--outputPath</arg><arg>${outputPath}</arg>
             <arg>--workingPath</arg><arg>${workingDir}/working</arg>
             <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
@@ -300,7 +288,7 @@
             <error to="Kill"/>
         </action>
 
-    <join name="wait_prepare" to="End"/>
+    <join name="wait_propagation" to="End"/>
 
     <end name="End"/>
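Two tuning changes recur across the propagation actions above: the publication step pins `--executor-memory=8G` in place of the shared `${sparkExecutorMemory}`, and every propagation step adds `spark.sql.shuffle.partitions=3840` to spread the join shuffle. The programmatic equivalent of the latter looks like this (the value is copied from the workflow, not a recommendation; whether it suits a given cluster is an assumption to validate):

```java
import org.apache.spark.SparkConf;

public class ShufflePartitionsExample {
	public static SparkConf tuned() {
		return new SparkConf()
			// matches the --conf added to each propagation <spark-opts> block
			.set("spark.sql.shuffle.partitions", "3840")
			.set("spark.dynamicAllocation.enabled", "true");
	}
}
```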
@@ -81,7 +81,7 @@ public class SubjectPreparationJobTest {
 		PrepareResultResultStep1
 			.main(
 				new String[] {
-					"-allowedSemRel",
+					"-allowedsemrels",
 					"IsSupplementedBy;IsSupplementTo;IsPreviousVersionOf;IsNewVersionOf;IsIdenticalTo;Obsoletes;IsObsoletedBy;IsVersionOf",
 					"-subjectlist", "fos;sdg",
 					"-resultType", "publication",
@@ -76,7 +76,7 @@ public class SubjectPropagationJobTest {
 				.getResource("/eu/dnetlib/dhp/subjectpropagation/preparedInfo")
 				.getPath(),
 			"-resultType", "publication",
-			"-inputPath", getClass()
+			"-sourcePath", getClass()
 				.getResource("/eu/dnetlib/dhp/subjectpropagation")
 				.getPath(),
 			"-isSparkSessionManaged", Boolean.FALSE.toString(),