forked from D-Net/dnet-hadoop
Add collecting software code repository URLs
parent 8a6892cc63
commit 9d44418d38

@@ -0,0 +1,104 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp-workflows</artifactId>
        <version>1.2.5-SNAPSHOT</version>
    </parent>

    <artifactId>dhp-swh</artifactId>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
        </dependency>

        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
            <version>${project.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>net.sf.saxon</groupId>
                    <artifactId>Saxon-HE</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
        </dependency>

        <dependency>
            <groupId>xml-apis</groupId>
            <artifactId>xml-apis</artifactId>
        </dependency>

        <dependency>
            <groupId>jaxen</groupId>
            <artifactId>jaxen</artifactId>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-distcp</artifactId>
        </dependency>

        <dependency>
            <groupId>eu.dnetlib</groupId>
            <artifactId>dnet-actionmanager-api</artifactId>
        </dependency>

        <dependency>
            <groupId>eu.dnetlib</groupId>
            <artifactId>dnet-actionmanager-common</artifactId>
            <exclusions>
                <exclusion>
                    <groupId>eu.dnetlib</groupId>
                    <artifactId>dnet-openaireplus-mapping-utils</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>saxonica</groupId>
                    <artifactId>saxon</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>saxonica</groupId>
                    <artifactId>saxon-dom</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>jgrapht</groupId>
                    <artifactId>jgrapht</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>net.sf.ehcache</groupId>
                    <artifactId>ehcache</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.springframework</groupId>
                    <artifactId>spring-test</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.*</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>apache</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.13</version>
        </dependency>

    </dependencies>
</project>
@@ -0,0 +1,211 @@

package eu.dnetlib.dhp.swh;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;

/**
 * Collects the distinct software code repository URLs from the software records of the graph Hive database,
 * as part of the Software Heritage (SWH) integration workflow.
 */
public class CollectSoftwareRepositoryURLs implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(CollectSoftwareRepositoryURLs.class);

    private static final String DEFAULT_VISIT_TYPE = "git";
    private static final int CONCURRENT_API_CALLS = 1;

    private static final String SWH_LATEST_VISIT_URL = "https://archive.softwareheritage.org/api/1/origin/%s/visit/latest/";
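
    // Parses the job arguments, configures the Hive metastore connection and runs the collection
    // inside a (possibly externally managed) Spark session.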
    public static <I extends Result> void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                CollectSoftwareRepositoryURLs.class
                    .getResourceAsStream("/eu/dnetlib/dhp/swh/input_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        final Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String hiveDbName = parser.get("hiveDbName");
        log.info("hiveDbName: {}", hiveDbName);

        final String outputPath = parser.get("softwareCodeRepositoryURLs");
        log.info("softwareCodeRepositoryURLs: {}", outputPath);

        final String hiveMetastoreUris = parser.get("hiveMetastoreUris");
        log.info("hiveMetastoreUris: {}", hiveMetastoreUris);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", hiveMetastoreUris);

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                doRun(spark, hiveDbName, outputPath);
            });
    }
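
    // Selects the distinct, non-null code repository URLs from the software table of the given
    // Hive database and writes them to the output path as CSV.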
    private static <I extends Result> void doRun(SparkSession spark, String hiveDbName, String outputPath) {

        String queryTemplate = "SELECT DISTINCT coderepositoryurl.value " +
            "FROM %s.software " +
            "WHERE coderepositoryurl.value IS NOT NULL";
        String query = String.format(queryTemplate, hiveDbName);

        log.info("Hive query to fetch software code URLs: {}", query);

        Dataset<Row> df = spark.sql(query);

        // write distinct repository URLs
        df
            .write()
            .mode(SaveMode.Overwrite)
            // .option("compression", "gzip")
            .csv(outputPath);
    }
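
    // Alternative reader that loads software records from JSON and keeps only the fields needed
    // for filtering; not wired into doRun() in this commit.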
    private static Dataset<Row> readSoftware(SparkSession spark, String inputPath) {
        return spark
            .read()
            .json(inputPath)
            .select(
                new Column("codeRepositoryUrl.value").as("codeRepositoryUrl"),
                new Column("dataInfo.deletedbyinference"),
                new Column("dataInfo.invisible"));
    }
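
    // Keeps software records with a non-null repository URL that are neither deleted by inference
    // nor invisible; the limit is a temporary cap used while testing.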
    private static Dataset<Row> filterSoftware(Dataset<Row> softwareDF, Integer limit) {

        Dataset<Row> df = softwareDF
            .where(softwareDF.col("codeRepositoryUrl").isNotNull())
            .where("deletedbyinference = false")
            .where("invisible = false")
            .drop("deletedbyinference")
            .drop("invisible");

        // TODO remove when done
        df = df.limit(limit);

        return df;
    }
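
    // Calls the SWH "latest visit" endpoint once per repository URL; repartitioning to
    // CONCURRENT_API_CALLS bounds the parallelism and each call is followed by a one-second pause
    // to stay within the API rate limits.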
    public static Dataset<Row> makeParallelRequests(SparkSession spark, Dataset<Row> softwareDF) {
        // TODO replace with coalesce ?
        Dataset<Row> df = softwareDF.repartition(CONCURRENT_API_CALLS);

        log.info("Number of partitions: {}", df.rdd().getNumPartitions());

        ObjectMapper objectMapper = new ObjectMapper();

        List<Row> collectedRows = df
            .javaRDD()
            // max parallelism should be equal to the number of partitions here
            .mapPartitions((FlatMapFunction<Iterator<Row>, Row>) partition -> {
                List<Row> resultRows = new ArrayList<>();
                while (partition.hasNext()) {
                    Row row = partition.next();
                    String url = String.format(SWH_LATEST_VISIT_URL, row.getString(0));

                    // String snapshotId = null;
                    // String type = null;
                    // String date = null;

                    String responseBody = makeAPICall(url);
                    TimeUnit.SECONDS.sleep(1);
                    // Thread.sleep(500);

                    // if (responseBody != null) {
                    //     LastVisitResponse visitResponse = objectMapper.readValue(responseBody, LastVisitResponse.class);
                    //     snapshotId = visitResponse.getSnapshot();
                    //     type = visitResponse.getType();
                    //     date = visitResponse.getDate();
                    // }
                    // resultRows.add(RowFactory.create(url, snapshotId, type, date));

                    resultRows.add(RowFactory.create(url, responseBody));
                }
                return resultRows.iterator();
            })
            .collect();

        StructType resultSchema = new StructType()
            .add("codeRepositoryUrl", DataTypes.StringType)
            .add("response", DataTypes.StringType);
        // .add("snapshotId", DataTypes.StringType)
        // .add("type", DataTypes.StringType)
        // .add("date", DataTypes.StringType);

        // create a DataFrame from the collected rows
        return spark.createDataFrame(collectedRows, resultSchema);
    }
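
    // Performs an authenticated GET request against the SWH API, prints the remaining rate-limit
    // header values, and returns the raw response body (or null when the response has no entity).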
    private static String makeAPICall(String url) throws IOException {
        System.out.println(java.time.LocalDateTime.now());

        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(url);
            httpGet
                .setHeader(
                    "Authorization",
"Bearer eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtODQ2Ny05MzAyZjk3MTFkOGEifQ.eyJpYXQiOjE2OTQ2MzYwMjAsImp0aSI6IjkwZjdkNTNjLTQ5YTktNGFiMy1hY2E0LTcwMTViMjEyZTNjNiIsImlzcyI6Imh0dHBzOi8vYXV0aC5zb2Z0d2FyZWhlcml0YWdlLm9yZy9hdXRoL3JlYWxtcy9Tb2Z0d2FyZUhlcml0YWdlIiwiYXVkIjoiaHR0cHM6Ly9hdXRoLnNvZnR3YXJlaGVyaXRhZ2Uub3JnL2F1dGgvcmVhbG1zL1NvZnR3YXJlSGVyaXRhZ2UiLCJzdWIiOiIzMTY5OWZkNC0xNmE0LTQxOWItYTdhMi00NjI5MDY4ZjI3OWEiLCJ0eXAiOiJPZmZsaW5lIiwiYXpwIjoic3doLXdlYiIsInNlc3Npb25fc3RhdGUiOiIzMjYzMzEwMS00ZDRkLTQwMjItODU2NC1iMzNlMTJiNTE3ZDkiLCJzY29wZSI6Im9wZW5pZCBvZmZsaW5lX2FjY2VzcyBwcm9maWxlIGVtYWlsIn0.XHj1VIZu1dZ4Ej32-oU84mFmaox9cLNjXosNxwZM0Xs");
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                int statusCode = response.getStatusLine().getStatusCode();
                // if (statusCode != 200)
                //     return null;

                Header[] headers = response.getHeaders("X-RateLimit-Remaining");
                for (Header header : headers) {
                    System.out
                        .println(
                            "Key : " + header.getName()
                                + " ,Value : " + header.getValue());
                }

                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    return EntityUtils.toString(entity);
                }
            }
        }
        return null;
    }
}
@@ -0,0 +1,40 @@

package eu.dnetlib.dhp.swh.models;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
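
/**
 * Models the fields of interest from the response of the SWH "latest visit" API endpoint:
 * the visit type, the visit date and the id of the resulting snapshot.
 */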
@JsonIgnoreProperties(ignoreUnknown = true)
public class LastVisitResponse {

    private String type;

    private String date;

    @JsonProperty("snapshot")
    private String snapshotId;

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getSnapshot() {
        return snapshotId;
    }

    public void setSnapshot(String snapshotId) {
        this.snapshotId = snapshotId;
    }
}
@@ -0,0 +1,26 @@
[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "ip",
    "paramLongName": "softwareCodeRepositoryURLs",
    "paramDescription": "the path where to store the software repository URLs",
    "paramRequired": true
  },
  {
    "paramName": "db",
    "paramLongName": "hiveDbName",
    "paramDescription": "the target hive database name",
    "paramRequired": true
  },
  {
    "paramName": "hmu",
    "paramLongName": "hiveMetastoreUris",
    "paramDescription": "the hive metastore uris",
    "paramRequired": true
  }
]
@@ -0,0 +1,25 @@
# hive
hiveDbName=openaire_prod_20230914
hiveMetastoreUris=thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083

# oozie
oozie.action.sharelib.for.spark=spark2
oozie.use.system.libpath=true
oozie.wf.application.path=${oozieTopWfApplicationPath}
oozieActionShareLibForSpark2=spark2

# spark
spark2EventLogDir=/user/spark/spark2ApplicationHistory
spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener
spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener
spark2YarnHistoryServerAddress=http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
sparkSqlWarehouseDir=/user/hive/warehouse

# misc
wfAppPath=${oozieTopWfApplicationPath}
resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster

# custom params
softwareCodeRepositoryURLs=${workingDir}/code_repo_urls.csv
resume=collect-software-repository-urls
@@ -0,0 +1,101 @@
<workflow-app name="Software-Heritage-Integration-Workflow" xmlns="uri:oozie:workflow:0.5">
    <!-- <parameters>-->
    <!-- <property>-->
    <!-- <name>apiDescription</name>-->
    <!-- <description>A json encoding of the API Description class</description>-->
    <!-- </property>-->
    <!-- <property>-->
    <!-- <name>dataSourceInfo</name>-->
    <!-- <description>A json encoding of the Datasource Info</description>-->
    <!-- </property>-->
    <!-- <property>-->
    <!-- <name>identifierPath</name>-->
    <!-- <description>An xpath to retrieve the metadata identifier for the generation of DNet Identifier </description>-->
    <!-- </property>-->
    <!-- <property>-->
    <!-- <name>metadataEncoding</name>-->
    <!-- <description> The type of the metadata XML/JSON</description>-->
    <!-- </property>-->
    <!-- <property>-->
    <!-- <name>timestamp</name>-->
    <!-- <description>The timestamp of the collection date</description>-->
    <!-- </property>-->
    <!-- <property>-->
    <!-- <name>workflowId</name>-->
    <!-- <description>The identifier of the workflow</description>-->
    <!-- </property>-->
    <!-- <property>-->
    <!-- <name>mdStoreID</name>-->
    <!-- <description>The identifier of the mdStore</description>-->
    <!-- </property>-->
    <!-- <property>-->
    <!-- <name>mdStoreManagerURI</name>-->
    <!-- <description>The URI of the MDStore Manager</description>-->
    <!-- </property>-->

    <!-- <property>-->
    <!-- <name>dnetMessageManagerURL</name>-->
    <!-- <description>The URI of the Dnet Message Manager</description>-->
    <!-- </property>-->
    <!-- <property>-->
    <!-- <name>collectionMode</name>-->
    <!-- <description>Should be REFRESH or INCREMENTAL</description>-->
    <!-- </property>-->

    <!-- <property>-->
    <!-- <name>collection_java_xmx</name>-->
    <!-- <value>-Xmx200m</value>-->
    <!-- <description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>-->
    <!-- </property>-->

    <!-- </parameters>-->

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
    </global>

    <start to="startFrom"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <decision name="startFrom">
        <switch>
            <case to="collect-software-repository-urls">${wf:conf('startFrom') eq 'collect-software-repository-urls'}</case>
            <default to="collect-software-repository-urls"/>
        </switch>
    </decision>

    <action name="collect-software-repository-urls">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Collect software repository URLs</name>
            <class>eu.dnetlib.dhp.swh.CollectSoftwareRepositoryURLs</class>
            <jar>dhp-swh-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>

            <arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>

</workflow-app>
@@ -39,6 +39,7 @@
 	<module>dhp-broker-events</module>
 	<module>dhp-doiboost</module>
 	<module>dhp-impact-indicators</module>
+	<module>dhp-swh</module>
 </modules>

 <pluginRepositories>