refactoring

This commit is contained in:
Miriam Baglioni 2023-07-01 12:32:48 +02:00
parent d9993ee5a2
commit d35ffd201d
38 changed files with 338 additions and 2410 deletions

View File

@ -26,7 +26,7 @@ public class MakeTar implements Serializable {
.toString(
MakeTar.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_maketar_parameters.json"));
"/eu/dnetlib/dhp/oa/graph/dump/eosc_input_maketar_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@ -36,280 +36,30 @@ public class ResultMapper implements Serializable {
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
if (ort.isPresent()) {
try {
addTypeSpecificInformation(out, input, ort);
Optional
.ofNullable(input.getAuthor())
.ifPresent(
ats -> out.setAuthor(ats.stream().map(ResultMapper::getAuthor).collect(Collectors.toList())));
// I do not map Access Right UNKNOWN or OTHER
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if (oar.isPresent() && Constants.ACCESS_RIGHTS_COAR_MAP.containsKey(oar.get().getClassid())) {
String code = Constants.ACCESS_RIGHTS_COAR_MAP.get(oar.get().getClassid());
out
.setBestaccessright(
BestAccessRight
.newInstance(
code,
Constants.COAR_CODE_LABEL_MAP.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
Optional
.ofNullable(input.getCountry())
.ifPresent(
value -> out
.setCountry(
value
.stream()
.map(
c -> {
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
return null;
}
ResultCountry country = new ResultCountry();
country.setCode(c.getClassid());
country.setLabel(c.getClassname());
Optional
.ofNullable(c.getDataInfo())
.ifPresent(
provenance -> country
.setProvenance(
Provenance
.newInstance(
provenance
.getProvenanceaction()
.getClassname(),
c.getDataInfo().getTrust())));
return country;
})
.filter(Objects::nonNull)
.collect(Collectors.toList())));
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
addTypeSpecificInformation(out, input, ort.get());
mapAuthor(out, input);
mapAccessRight(out, input);
mapContributor(out, input);
mapCountry(out, input);
mapCoverage(out, input);
out.setDateofcollection(input.getDateofcollection());
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
if (Optional.ofNullable(input.getEmbargoenddate()).isPresent()) {
out.setEmbargoenddate(input.getEmbargoenddate().getValue());
}
if (Optional.ofNullable(input.getMeasures()).isPresent()) {
Indicator i = new Indicator();
UsageCounts uc = new UsageCounts();
input.getMeasures().forEach(m -> {
if (m.getId().equals("downloads")) {
uc.setDownloads(m.getUnit().get(0).getValue());
}
if (m.getId().equals("views")) {
uc.setViews(m.getUnit().get(0).getValue());
}
});
if (!uc.isEmpty()) {
i.setUsageCounts(uc);
out.setIndicator(i);
}
}
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
mapDescription(out, input);
mapEmbrargo(out, input);
mapMeasure(out, input);
mapFormat(out, input);
out.setId(input.getId());
out.setOriginalId(new ArrayList<>());
Optional
.ofNullable(input.getOriginalId())
.ifPresent(
v -> out
.setOriginalId(
input
.getOriginalId()
.stream()
.filter(s -> !s.startsWith("50|"))
.collect(Collectors.toList())));
if (Optional
.ofNullable(input.getInstance())
.isPresent()) {
out
.setInstance(
input
.getInstance()
.stream()
.map(i -> getCommunityInstance(i))
.collect(Collectors.toList()));
}
if (Optional.ofNullable(input.getLanguage()).isPresent()) {
out
.setLanguage(
Language.newInstance(input.getLanguage().getClassid(), input.getLanguage().getClassname()));
}
if (Optional.ofNullable(input.getLastupdatetimestamp()).isPresent()) {
out.setLastupdatetimestamp(input.getLastupdatetimestamp());
}
if (Optional.ofNullable(input.getTitle()).isPresent()) {
List<StructuredProperty> iTitle = input
.getTitle()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setMaintitle(iTitle.get(0).getValue());
}
iTitle = input
.getTitle()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setSubtitle(iTitle.get(0).getValue());
}
}
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> out
.setPid(
value
.stream()
.map(
p -> ResultPid
.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
if (Optional.ofNullable(input.getDateofacceptance()).isPresent()) {
out.setPublicationdate(input.getDateofacceptance().getValue());
}
if (Optional.ofNullable(input.getPublisher()).isPresent()) {
out.setPublisher(input.getPublisher().getValue());
}
Optional
.ofNullable(input.getSource())
.ifPresent(
value -> out.setSource(value.stream().map(Field::getValue).collect(Collectors.toList())));
if (Optional.ofNullable(input.getSubject()).isPresent()) {
out.setSubject(createSubjectMap(input));
out
.setKeywords(
input
.getSubject()
.stream()
.filter(
s -> s.getQualifier().getClassid().equalsIgnoreCase("keyword") &&
!s.getValue().equalsIgnoreCase("EOSC::RO-crate"))
.map(s -> s.getValue())
.collect(Collectors.toList()));
if (Optional.ofNullable(input.getEoscifguidelines()).isPresent()) {
out
.setEoscIF(
input
.getEoscifguidelines()
.stream()
.map(
eig -> EoscInteroperabilityFramework
.newInstance(
eig.getCode(), eig.getLabel(), eig.getUrl(),
eig.getSemanticRelation()))
.collect(Collectors.toList()));
}
}
mapOriginalId(out, input);
mapInstance(out, input);
mapLamguage(out, input);
mapLastUpdateTimestamp(out, input);
mapTitle(out, input);
mapPid(out, input);
mapAcceptanceDate(out, input);
mapPublisher(out, input);
mapSource(out, input);
mapSubject(out, input);
out.setType(input.getResulttype().getClassid());
Set<String> communities = communityMap.keySet();
List<Context> contextList = Optional
.ofNullable(
input
.getContext())
.map(
value -> value
.stream()
.map(c -> {
String communityId = c.getId();
if (communityId.contains("::")) {
communityId = communityId.substring(0, communityId.indexOf("::"));
}
if (communities.contains(communityId)) {
Context context = new Context();
context.setCode(communityId);
context.setLabel(communityMap.get(communityId));
Optional<List<DataInfo>> dataInfo = Optional.ofNullable(c.getDataInfo());
if (dataInfo.isPresent()) {
List<Provenance> provenance = new ArrayList<>();
provenance
.addAll(
dataInfo
.get()
.stream()
.map(
di -> Optional
.ofNullable(di.getProvenanceaction())
.map(
provenanceaction -> Provenance
.newInstance(
provenanceaction.getClassname(),
di.getTrust()))
.orElse(null))
.filter(Objects::nonNull)
.collect(Collectors.toSet()));
try {
context.setProvenance(getUniqueProvenance(provenance));
} catch (NoAvailableEntityTypeException e) {
e.printStackTrace();
}
}
return context;
}
return null;
})
.filter(Objects::nonNull)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
if (!contextList.isEmpty()) {
Set<Integer> hashValue = new HashSet<>();
List<Context> remainigContext = new ArrayList<>();
contextList.forEach(c -> {
if (!hashValue.contains(c.hashCode())) {
remainigContext.add(c);
hashValue.add(c.hashCode());
}
});
out.setContext(remainigContext);
}
mapContext(communityMap, out, input);
} catch (ClassCastException cce) {
return null;
}
@ -319,6 +69,314 @@ public class ResultMapper implements Serializable {
}
private static void mapContext(Map<String, String> communityMap, Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Set<String> communities = communityMap.keySet();
List<Context> contextList = Optional
.ofNullable(
input
.getContext())
.map(
value -> value
.stream()
.map(c -> {
String communityId = c.getId();
if (communityId.contains("::")) {
communityId = communityId.substring(0, communityId.indexOf("::"));
}
if (communities.contains(communityId)) {
Context context = new Context();
context.setCode(communityId);
context.setLabel(communityMap.get(communityId));
Optional<List<DataInfo>> dataInfo = Optional.ofNullable(c.getDataInfo());
if (dataInfo.isPresent()) {
List<Provenance> provenance = new ArrayList<>();
provenance
.addAll(
dataInfo
.get()
.stream()
.map(
di -> Optional
.ofNullable(di.getProvenanceaction())
.map(
provenanceaction -> Provenance
.newInstance(
provenanceaction.getClassname(),
di.getTrust()))
.orElse(null))
.filter(Objects::nonNull)
.collect(Collectors.toSet()));
try {
context.setProvenance(getUniqueProvenance(provenance));
} catch (NoAvailableEntityTypeException e) {
e.printStackTrace();
}
}
return context;
}
return null;
})
.filter(Objects::nonNull)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
if (!contextList.isEmpty()) {
Set<Integer> hashValue = new HashSet<>();
List<Context> remainigContext = new ArrayList<>();
contextList.forEach(c -> {
if (!hashValue.contains(c.hashCode())) {
remainigContext.add(c);
hashValue.add(c.hashCode());
}
});
out.setContext(remainigContext);
}
}
private static void mapSubject(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getSubject()).isPresent()) {
out.setSubject(createSubjectMap(input));
out
.setKeywords(
input
.getSubject()
.stream()
.filter(
s -> s.getQualifier().getClassid().equalsIgnoreCase("keyword") &&
!s.getValue().equalsIgnoreCase("EOSC::RO-crate"))
.map(s -> s.getValue())
.collect(Collectors.toList()));
if (Optional.ofNullable(input.getEoscifguidelines()).isPresent()) {
out
.setEoscIF(
input
.getEoscifguidelines()
.stream()
.map(
eig -> EoscInteroperabilityFramework
.newInstance(
eig.getCode(), eig.getLabel(), eig.getUrl(),
eig.getSemanticRelation()))
.collect(Collectors.toList()));
}
}
}
private static void mapSource(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional
.ofNullable(input.getSource())
.ifPresent(
value -> out.setSource(value.stream().map(Field::getValue).collect(Collectors.toList())));
}
private static void mapPublisher(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getPublisher()).isPresent()) {
out.setPublisher(input.getPublisher().getValue());
}
}
private static void mapAcceptanceDate(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getDateofacceptance()).isPresent()) {
out.setPublicationdate(input.getDateofacceptance().getValue());
}
}
private static void mapPid(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> out
.setPid(
value
.stream()
.map(
p -> ResultPid
.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
}
private static void mapTitle(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getTitle()).isPresent()) {
List<StructuredProperty> iTitle = input
.getTitle()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setMaintitle(iTitle.get(0).getValue());
}
iTitle = input
.getTitle()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setSubtitle(iTitle.get(0).getValue());
}
}
}
private static void mapLastUpdateTimestamp(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getLastupdatetimestamp()).isPresent()) {
out.setLastupdatetimestamp(input.getLastupdatetimestamp());
}
}
private static void mapLamguage(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getLanguage()).isPresent()) {
out
.setLanguage(
Language.newInstance(input.getLanguage().getClassid(), input.getLanguage().getClassname()));
}
}
private static void mapInstance(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional
.ofNullable(input.getInstance())
.isPresent()) {
out
.setInstance(
input
.getInstance()
.stream()
.map(i -> getCommunityInstance(i))
.collect(Collectors.toList()));
}
}
private static void mapOriginalId(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
out.setOriginalId(new ArrayList<>());
Optional
.ofNullable(input.getOriginalId())
.ifPresent(
v -> out
.setOriginalId(
input
.getOriginalId()
.stream()
.filter(s -> !s.startsWith("50|"))
.collect(Collectors.toList())));
}
private static void mapFormat(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
}
private static void mapMeasure(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getMeasures()).isPresent()) {
Indicator i = new Indicator();
UsageCounts uc = new UsageCounts();
input.getMeasures().forEach(m -> {
if (m.getId().equals("downloads")) {
uc.setDownloads(m.getUnit().get(0).getValue());
}
if (m.getId().equals("views")) {
uc.setViews(m.getUnit().get(0).getValue());
}
});
if (!uc.isEmpty()) {
i.setUsageCounts(uc);
out.setIndicator(i);
}
}
}
private static void mapEmbrargo(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getEmbargoenddate()).isPresent()) {
out.setEmbargoenddate(input.getEmbargoenddate().getValue());
}
}
private static void mapDescription(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
}
private static void mapCoverage(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
}
private static void mapCountry(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional
.ofNullable(input.getCountry())
.ifPresent(
value -> out
.setCountry(
value
.stream()
.map(
c -> {
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
return null;
}
ResultCountry country = new ResultCountry();
country.setCode(c.getClassid());
country.setLabel(c.getClassname());
Optional
.ofNullable(c.getDataInfo())
.ifPresent(
provenance -> country
.setProvenance(
Provenance
.newInstance(
provenance
.getProvenanceaction()
.getClassname(),
c.getDataInfo().getTrust())));
return country;
})
.filter(Objects::nonNull)
.collect(Collectors.toList())));
}
private static void mapContributor(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
}
private static void mapAccessRight(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
// I do not map Access Right UNKNOWN or OTHER
Optional<Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if (oar.isPresent() && Constants.ACCESS_RIGHTS_COAR_MAP.containsKey(oar.get().getClassid())) {
String code = Constants.ACCESS_RIGHTS_COAR_MAP.get(oar.get().getClassid());
out
.setBestaccessright(
BestAccessRight
.newInstance(
code,
Constants.COAR_CODE_LABEL_MAP.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
private static void mapAuthor(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional
.ofNullable(input.getAuthor())
.ifPresent(
ats -> out.setAuthor(ats.stream().map(ResultMapper::getAuthor).collect(Collectors.toList())));
}
private static Map<String, List<eu.dnetlib.dhp.eosc.model.Subject>> createSubjectMap(
eu.dnetlib.dhp.schema.oaf.Result input) {
Map<String, List<eu.dnetlib.dhp.eosc.model.Subject>> map = new HashMap<>();
@ -345,8 +403,8 @@ public class ResultMapper implements Serializable {
}
private static void addTypeSpecificInformation(Result out, eu.dnetlib.dhp.schema.oaf.Result input,
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort) throws NoAvailableEntityTypeException {
switch (ort.get().getClassid()) {
eu.dnetlib.dhp.schema.oaf.Qualifier ort) throws NoAvailableEntityTypeException {
switch (ort.getClassid()) {
case "publication":
Optional<Journal> journal = Optional
.ofNullable(((Publication) input).getJournal());

View File

@ -25,7 +25,7 @@ public class SendToZenodoHDFS implements Serializable {
.toString(
SendToZenodoHDFS.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/upload_zenodo.json")));
"/eu/dnetlib/dhp/oa/graph/dump/eosc_upload_zenodo.json")));
parser.parseArgument(args);

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
public class MergedRels implements Serializable {
private String organizationId;
private String representativeId;
public String getOrganizationId() {
return organizationId;
}
public void setOrganizationId(String organizationId) {
this.organizationId = organizationId;
}
public String getRepresentativeId() {
return representativeId;
}
public void setRepresentativeId(String representativeId) {
this.representativeId = representativeId;
}
}

View File

@ -57,7 +57,7 @@ public class SaveCommunityMap implements Serializable {
.toString(
SaveCommunityMap.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_cm_parameters.json"));
"/eu/dnetlib/dhp/oa/graph/dump/eosc_cm_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@ -35,7 +35,7 @@ public class SparkExtendResultWithRelation implements Serializable {
.toString(
SparkExtendResultWithRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_extendwithrelation_parameters.json"));
"/eu/dnetlib/dhp/oa/graph/dump/eosc_input_extendwithrelation_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@ -49,7 +49,7 @@ public class SparkPrepareResultProject implements Serializable {
.toString(
SparkPrepareResultProject.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/project_prep_parameters.json"));
"/eu/dnetlib/dhp/oa/graph/dump/eosc_project_prep_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@ -36,7 +36,7 @@ public class SparkSelectRelation implements Serializable {
.toString(
SparkSelectRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json"));
"/eu/dnetlib/dhp/oa/graph/dump/eosc_input_relationdump_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@ -33,7 +33,7 @@ public class SparkUpdateProjectInfo implements Serializable {
.toString(
SparkUpdateProjectInfo.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/project_input_parameters.json"));
"/eu/dnetlib/dhp/oa/graph/dump/eosc_project_input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@ -1,26 +0,0 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "gp",
"paramLongName": "graphPath",
"paramDescription": "the relationPath",
"paramRequired": false
}
]

View File

@ -1,30 +0,0 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "ra",
"paramLongName": "resultAggregation",
"paramDescription": "true if all the result type should be saved under the generic result name. false to get a different dump for each result type",
"paramRequired": true
}
]

View File

@ -1,30 +0,0 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
}
]

View File

@ -1,24 +0,0 @@
[
{
"paramName":"is",
"paramLongName":"isLookUpUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": false
},
{
"paramName": "hdfs",
"paramLongName": "hdfsPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "nameNode",
"paramDescription": "the name node",
"paramRequired": true
}
]

View File

@ -1,36 +0,0 @@
[
{
"paramName":"ocm",
"paramLongName":"organizationCommunityMap",
"paramDescription": "the organization community map association",
"paramRequired": false
},
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName":"cmp",
"paramLongName":"communityMapPath",
"paramDescription": "the path to the serialization of the community map",
"paramRequired": true
}
]

View File

@ -1,20 +0,0 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]

View File

@ -1,47 +0,0 @@
[
{
"paramName":"cmp",
"paramLongName":"communityMapPath",
"paramDescription": "the path to the serialization of the community map",
"paramRequired": true
},
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName":"dt",
"paramLongName":"dumpType",
"paramDescription": "the type of the dump (complete for the whole graph, community for the products related to communities, funder for the results with at least a link to project",
"paramRequired": false
}, {
"paramName":"cid",
"paramLongName":"communityId",
"paramDescription": "the id of the community to be dumped",
"paramRequired": false
}
]

View File

@ -1,41 +0,0 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName":"gp",
"paramLongName":"graphPath",
"paramDescription": "the path to the relations",
"paramRequired": true
},
{
"paramName":"cmp",
"paramLongName":"communityMapPath",
"paramDescription": "the path to the relations",
"paramRequired": true
}
]

View File

@ -1,27 +0,0 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "pl",
"paramLongName": "projectListPath",
"paramDescription": "the path of the association result projectlist",
"paramRequired": true
}
]

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,171 +0,0 @@
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>projectListPath</name>
<description>the path to the project list</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>accessToken</name>
<description>the access token used for the deposition in Zenodo</description>
</property>
<property>
<name>connectionUrl</name>
<description>the connection url for Zenodo</description>
</property>
<property>
<name>metadata</name>
<description> the metadata associated to the deposition</description>
</property>
<property>
<name>depositionType</name>
<description>the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided)</description>
</property>
<property>
<name>conceptRecordId</name>
<description>for new version, the id of the record for the old deposition</description>
</property>
<property>
<name>depositionId</name>
<description>the depositionId of a deposition open that has to be added content</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="dump_project"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="dump_project">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--outputPath</arg><arg>${workingDir}/project</arg>
<arg>--communityMapPath</arg><arg>noneed</arg>
</spark>
<ok to="get_new_projects"/>
<error to="Kill"/>
</action>
<action name="get_new_projects">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectsSubsetSparkJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/project</arg>
<arg>--outputPath</arg><arg>${workingDir}/tar/project</arg>
<arg>--projectListPath</arg><arg>${projectListPath}</arg>
</spark>
<ok to="make_archive"/>
<error to="Kill"/>
</action>
<action name="make_archive">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.eosc.MakeTar</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--sourcePath</arg><arg>${workingDir}/tar</arg>
</java>
<ok to="send_zenodo"/>
<error to="Kill"/>
</action>
<action name="send_zenodo">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--accessToken</arg><arg>${accessToken}</arg>
<arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
<arg>--metadata</arg><arg>${metadata}</arg>
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
<arg>--depositionType</arg><arg>${depositionType}</arg>
<arg>--depositionId</arg><arg>${depositionId}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,37 +0,0 @@
[
{
"paramName":"cmp",
"paramLongName":"communityMapPath",
"paramDescription": "the path to the serialization of the community map",
"paramRequired": false
},
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}, {
"paramName":"cid",
"paramLongName":"communityId",
"paramDescription": "the id of the community to be dumped",
"paramRequired": false
}
]

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,4 +0,0 @@
## This is a classpath-based import file (this header is required)
dump_complete classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/complete/oozie_app
dump_funder classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/funder/oozie_app
dump_community classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app

View File

@ -1,306 +0,0 @@
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>singleDeposition</name>
<description>Indicates if it is a single community deposition</description>
</property>
<property>
<name>communityId</name>
<description>the id of the community to be dumped if a dump for a single community should be done</description>
</property>
<property>
<name>dumpType</name>
<description>the type of the dump one of {complete, community, funder}</description>
</property>
<property>
<name>onlyUpload</name>
<description>true if the dump is already done and should only be upload in zenodo</description>
</property>
<property>
<name>upload</name>
<description>true if the dump should be upload in zenodo</description>
</property>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>isLookUpUrl</name>
<description>the isLookup service endpoint</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>resultAggregation</name>
<description>true if all the result type have to be dumped under result. false otherwise</description>
</property>
<property>
<name>accessToken</name>
<description>the access token used for the deposition in Zenodo</description>
</property>
<property>
<name>connectionUrl</name>
<description>the connection url for Zenodo</description>
</property>
<property>
<name>metadata</name>
<description> the metadata associated to the deposition</description>
</property>
<property>
<name>depositionType</name>
<description>the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided)</description>
</property>
<property>
<name>conceptRecordId</name>
<description>for new version, the id of the record for the old deposition</description>
</property>
<property>
<name>depositionId</name>
<description>the depositionId of a deposition open that has to be added content</description>
</property>
<property>
<name>organizationCommunityMap</name>
<description>the organization community map</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="only_upload"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="only_upload">
<switch>
<case to="send_zenodo">${wf:conf('onlyUpload') eq true}</case>
<default to="reset_outputpath"/>
</switch>
</decision>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="save_community_map"/>
<error to="Kill"/>
</action>
<action name="save_community_map">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.eosc.SaveCommunityMap</main-class>
<arg>--outputPath</arg><arg>${workingDir}/communityMap</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--singleDeposition</arg><arg>${singleDeposition}</arg>
<arg>--communityId</arg><arg>${communityId}</arg>
</java>
<ok to="choose_dump"/>
<error to="Kill"/>
</action>
<decision name="choose_dump">
<switch>
<case to="dump_funder">${wf:conf('dumpType') eq "funder"}</case>
<case to="dump_community">${wf:conf('dumpType') eq "community"}</case>
<default to="dump_complete"/>
</switch>
</decision>
<!-- Sub-workflow which runs the dump for the complete graph -->
<action name="dump_complete">
<sub-workflow>
<app-path>${wf:appPath()}/dump_complete
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>communityMapPath</name>
<value>${workingDir}/communityMap</value>
</property>
<property>
<name>outputPath</name>
<value>${workingDir}/tar</value>
</property>
<property>
<name>sourcePath</name>
<value>${sourcePath}</value>
</property>
<property>
<name>organizationCommunityMap</name>
<value>${organizationCommunityMap}</value>
</property>
<property>
<name>isLookUpUrl</name>
<value>${isLookUpUrl}</value>
</property>
<property>
<name>resultAggregation</name>
<value>${resultAggregation}</value>
</property>
</configuration>
</sub-workflow>
<ok to="make_archive" />
<error to="Kill" />
</action>
<!-- Sub-workflow which runs the dump for the complete graph -->
<action name="dump_community">
<sub-workflow>
<app-path>${wf:appPath()}/dump_community
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${sourcePath}</value>
</property>
<property>
<name>communityMapPath</name>
<value>${workingDir}/communityMap</value>
</property>
<property>
<name>outputPath</name>
<value>${workingDir}/tar</value>
</property>
</configuration>
</sub-workflow>
<ok to="make_archive" />
<error to="Kill" />
</action>
<action name="dump_funder">
<sub-workflow>
<app-path>${wf:appPath()}/dump_funder
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>communityMapPath</name>
<value>${workingDir}/communityMap</value>
</property>
<property>
<name>outputPath</name>
<value>${workingDir}/tar</value>
</property>
<property>
<name>sourcePath</name>
<value>${sourcePath}</value>
</property>
<property>
<name>dumpType</name>
<value>${dumpType}</value>
</property>
</configuration>
</sub-workflow>
<ok to="make_archive" />
<error to="Kill" />
</action>
<action name="make_archive">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.eosc.MakeTar</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--sourcePath</arg><arg>${workingDir}/tar</arg>
</java>
<ok to="should_upload"/>
<error to="Kill"/>
</action>
<decision name="should_upload">
<switch>
<case to="send_zenodo">${wf:conf('upload') eq true}</case>
<default to="End"/>
</switch>
</decision>
<action name="send_zenodo">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--accessToken</arg><arg>${accessToken}</arg>
<arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
<arg>--metadata</arg><arg>${metadata}</arg>
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
<arg>--depositionType</arg><arg>${depositionType}</arg>
<arg>--depositionId</arg><arg>${depositionId}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,362 +0,0 @@
<workflow-app name="sub_dump_community_products" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="fork_dump"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<fork name="fork_dump">
<path start="dump_publication"/>
<path start="dump_dataset"/>
<path start="dump_orp"/>
<path start="dump_software"/>
</fork>
<action name="dump_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table publication for community/funder related products</name>
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/publication</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
<arg>--dumpType</arg><arg>${dumpType}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table dataset for community/funder related products</name>
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/dataset</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table ORP for community related products</name>
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table software for community related products</name>
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/software</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<join name="join_dump" to="prepareResultProject"/>
<action name="prepareResultProject">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Prepare association result subset of project info</name>
<class>eu.dnetlib.dhp.oa.graph.dump.eosc.SparkPrepareResultProject</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="fork_extendWithProject"/>
<error to="Kill"/>
</action>
<fork name="fork_extendWithProject">
<path start="extend_publication"/>
<path start="extend_dataset"/>
<path start="extend_orp"/>
<path start="extend_software"/>
</fork>
<action name="extend_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Extend dumped publications with information about project</name>
<class>eu.dnetlib.dhp.oa.graph.dump.eosc.SparkUpdateProjectInfo</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/dump/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/ext/publication</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="join_extend"/>
<error to="Kill"/>
</action>
<action name="extend_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Extend dumped dataset with information about project</name>
<class>eu.dnetlib.dhp.oa.graph.dump.eosc.SparkUpdateProjectInfo</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/dump/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/ext/dataset</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="join_extend"/>
<error to="Kill"/>
</action>
<action name="extend_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Extend dumped ORP with information about project</name>
<class>eu.dnetlib.dhp.oa.graph.dump.eosc.SparkUpdateProjectInfo</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/ext/orp</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="join_extend"/>
<error to="Kill"/>
</action>
<action name="extend_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Extend dumped software with information about project</name>
<class>eu.dnetlib.dhp.oa.graph.dump.eosc.SparkUpdateProjectInfo</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/dump/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/ext/software</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="join_extend"/>
<error to="Kill"/>
</action>
<join name="join_extend" to="splitForCommunities"/>
<action name="splitForCommunities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Split dumped result for community</name>
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkSplitForCommunity</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/ext</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,539 +0,0 @@
<workflow-app name="sub-dump_complete" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>resultAggregation</name>
<description>true if all the result type have to be dumped under result. false otherwise</description>
</property>
<property>
<name>organizationCommunityMap</name>
<description>the organization community map</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="fork_dump" />
<fork name="fork_dump">
<path start="dump_publication"/>
<path start="dump_dataset"/>
<path start="dump_orp"/>
<path start="dump_software"/>
<path start="dump_organization"/>
<path start="dump_project"/>
<path start="dump_datasource"/>
<path start="select_relation"/>
</fork>
<action name="dump_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table publication </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/result/publication</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table dataset </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/result/dataset</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table ORP </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/result/otherresearchproduct</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table software </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/result/software</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_organization">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table organization </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/organization</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--outputPath</arg><arg>${outputPath}/organization</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_project">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--outputPath</arg><arg>${outputPath}/project</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="dump_datasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table datasource </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/datasource</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--outputPath</arg><arg>${outputPath}/datasource</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="select_relation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Select valid table relation </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkSelectValidRelationsJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/validrelation</arg>
</spark>
<ok to="dump_relation"/>
<error to="Kill"/>
</action>
<action name="dump_relation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table relation </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpRelationJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/validrelation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/relation</arg>
<arg>--removeSet</arg><arg>${removeSet}</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<join name="join_dump" to="fork_context"/>
<fork name="fork_context">
<path start="create_entities_fromcontext"/>
<path start="create_relation_fromcontext"/>
<path start="create_relation_fromorgs"/>
</fork>
<action name="create_entities_fromcontext">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextEntities</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}/communities_infrastructures/communities_infrastructure.json.gz</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</java>
<ok to="join_context"/>
<error to="Kill"/>
</action>
<action name="create_relation_fromcontext">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextRelation</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/relation/context</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</java>
<ok to="join_context"/>
<error to="Kill"/>
</action>
<action name="create_relation_fromorgs">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table relation </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkOrganizationRelation</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/contextOrg</arg>
<arg>--organizationCommunityMap</arg><arg>${organizationCommunityMap}</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_context"/>
<error to="Kill"/>
</action>
<join name="join_context" to="fork_extract_relations"/>
<fork name="fork_extract_relations">
<path start="rels_from_pubs"/>
<path start="rels_from_dats"/>
<path start="rels_from_orp"/>
<path start="rels_from_sw"/>
</fork>
<action name="rels_from_pubs">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Extract Relations from publication </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/publication</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_extract_relations"/>
<error to="Kill"/>
</action>
<action name="rels_from_dats">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table dataset </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/dataset</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_extract_relations"/>
<error to="Kill"/>
</action>
<action name="rels_from_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table ORP </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/orp</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_extract_relations"/>
<error to="Kill"/>
</action>
<action name="rels_from_sw">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table software </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/software</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_extract_relations"/>
<error to="Kill"/>
</action>
<join name="join_extract_relations" to="collect_and_save"/>
<action name="collect_and_save">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Collect Results and Relations and put them in the right path </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkCollectAndSave</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--resultAggregation</arg><arg>${resultAggregation}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<kill name="Kill">
<message>Sub-workflow dump complete failed with error message ${wf:errorMessage()}
</message>
</kill>
<end name="End" />
</workflow-app>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,255 +0,0 @@
<workflow-app name="sub_dump_funder_results" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="prepareResultProject"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="prepareResultProject">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Prepare association result subset of project info</name>
<class>eu.dnetlib.dhp.oa.graph.dump.eosc.SparkPrepareResultProject</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="fork_result_linked_to_projects"/>
<error to="Kill"/>
</action>
<fork name="fork_result_linked_to_projects">
<path start="select_publication_linked_to_projects"/>
<path start="select_dataset_linked_to_projects"/>
<path start="select_orp_linked_to_project"/>
<path start="select_software_linked_to_projects"/>
</fork>
<action name="select_publication_linked_to_projects">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump funder results </name>
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/result/publication</arg>
<arg>--graphPath</arg><arg>${workingDir}/preparedInfo</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_link"/>
<error to="Kill"/>
</action>
<action name="select_dataset_linked_to_projects">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump funder results </name>
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/result/dataset</arg>
<arg>--graphPath</arg><arg>${workingDir}/preparedInfo</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_link"/>
<error to="Kill"/>
</action>
<action name="select_orp_linked_to_project">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump funder results </name>
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/result/otherresearchproduct</arg>
<arg>--graphPath</arg><arg>${workingDir}/preparedInfo</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_link"/>
<error to="Kill"/>
</action>
<action name="select_software_linked_to_projects">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump funder results </name>
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/result/software</arg>
<arg>--graphPath</arg><arg>${workingDir}/preparedInfo</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
</spark>
<ok to="join_link"/>
<error to="Kill"/>
</action>
<join name="join_link" to="dump_funder_results"/>
<action name="dump_funder_results">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump funder results </name>
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkDumpFunderResults</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/result</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>