diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java
index 55f9ceb8b..d26d9c0e9 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java
@@ -53,9 +53,9 @@ public class HttpClientParams {
*/
private String requestMethod;
-
public HttpClientParams() {
- this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(), _requestMethod);
+ this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(),
+ _requestMethod);
}
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLs.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLs.java
index 7b3b74d9e..38db27baf 100644
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLs.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLs.java
@@ -1,14 +1,16 @@
package eu.dnetlib.dhp.swh;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.CollectorException;
-import eu.dnetlib.dhp.common.collection.HttpClientParams;
-import eu.dnetlib.dhp.swh.models.LastVisitData;
-import eu.dnetlib.dhp.swh.utils.SWHConnection;
-import eu.dnetlib.dhp.swh.utils.SWHConstants;
-import eu.dnetlib.dhp.swh.utils.SWHUtils;
+import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
+import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
+
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
@@ -17,14 +19,17 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
-import java.net.URL;
-import java.util.Date;
-import java.util.Optional;
-import java.util.concurrent.TimeUnit;
+import com.fasterxml.jackson.databind.ObjectMapper;
-import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
-import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+import eu.dnetlib.dhp.common.collection.HttpClientParams;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
+import eu.dnetlib.dhp.swh.models.LastVisitData;
+import eu.dnetlib.dhp.swh.utils.SWHConnection;
+import eu.dnetlib.dhp.swh.utils.SWHConstants;
+import eu.dnetlib.dhp.swh.utils.SWHUtils;
/**
* Sends archive requests to the SWH API for those software repository URLs that are missing from the archive
@@ -69,7 +74,8 @@ public class ArchiveRepositoryURLs {
}
- private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays) throws IOException {
+ private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays)
+ throws IOException {
SequenceFile.Reader fr = SWHUtils.getSequenceFileReader(fs, inputPath);
SequenceFile.Writer fw = SWHUtils.getSequenceFileWriter(fs, outputPath);
@@ -81,7 +87,13 @@ public class ArchiveRepositoryURLs {
// Read key-value pairs from the SequenceFile and handle appropriately
while (fr.next(repoUrl, lastVisitData)) {
- String response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
+ String response = null;
+ try {
+ response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
+ } catch (java.text.ParseException e) {
+ log.error("Could not handle record with repo Url: {}", repoUrl.toString());
+ throw new RuntimeException(e);
+ }
// response is null when no request was needed
if (response != null) {
@@ -95,43 +107,68 @@ public class ArchiveRepositoryURLs {
fr.close();
}
- public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays) throws IOException {
- System.out.println("Key: " + repoUrl + ", Value: " + lastVisitData);
+ public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays)
+ throws IOException, java.text.ParseException {
+
+ log.info("{ Key: {}, Value: {} }", repoUrl, lastVisitData);
LastVisitData lastVisit = OBJECT_MAPPER.readValue(lastVisitData, LastVisitData.class);
- // perform an archive request when no repoUrl was not found in previous step
+ // a previous archival attempt was made and the repository URL was not found;
+ // avoid performing the same archive request again
+ if (lastVisit.getType() != null &&
+ lastVisit.getType().equals(SWHConstants.VISIT_STATUS_NOT_FOUND)) {
+
+ log.info("Avoid request -- previous archive request returned NOT_FOUND");
+ return null;
+ }
+
+ // if we have last visit data
if (lastVisit.getSnapshot() != null) {
- // OR last visit was before (now() - archiveThresholdInDays)
- long diffInMillies = Math.abs((new Date()).getTime() - lastVisit.getDate().getTime());
- long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
+ String cleanDate = GraphCleaningFunctions.cleanDate(lastVisit.getDate());
- if (archiveThresholdInDays >= diffInDays) {
- return null;
+ // and the last visit date can be parsed
+ if (cleanDate != null) {
+
+ SimpleDateFormat formatter = new SimpleDateFormat(ModelSupport.DATE_FORMAT);
+ Date lastVisitDate = formatter.parse(cleanDate);
+
+ // compute how many days have passed since the last visit
+ long diffInMillies = Math.abs((new Date()).getTime() - lastVisitDate.getTime());
+ long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
+ log.info("Date diff from now (in days): {}", diffInDays);
+
+ // do not perform a request if the last visit is no older than archiveThresholdInDays days
+ if (archiveThresholdInDays >= diffInDays) {
+ log.info("Avoid request -- no older than {} days", archiveThresholdInDays);
+ return null;
+ }
}
}
- // if last visit data are available, re-use version control type, else use the default one (i.e., git)
+ // ELSE perform an archive request
+ log.info("Perform archive request for: {}", repoUrl);
+
+ // if last visit data are available, re-use version control type,
+ // else use the default one (i.e., git)
String visitType = Optional
.ofNullable(lastVisit.getType())
.orElse(SWHConstants.DEFAULT_VISIT_TYPE);
URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, visitType, repoUrl.trim()));
- System.out.println(url.toString());
+
+ log.info("Sending archive request: {}", url);
String response;
try {
response = swhConnection.call(url.toString());
} catch (CollectorException e) {
- log.info("Error in request: {}", url);
+ log.error("Error in request: {}", url);
response = "{}";
}
return response;
-
}
-
-
}
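For reference, the skip-or-archive decision above condenses to the following standalone sketch. The class name, method name, and the date pattern (standing in for ModelSupport.DATE_FORMAT) are illustrative assumptions; only the threshold arithmetic mirrors the patch.

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.concurrent.TimeUnit;

    public class ArchiveThresholdSketch {

        // assumption: an ISO-8601-like pattern standing in for ModelSupport.DATE_FORMAT
        private static final String DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ssZ";

        // true when the last visit is recent enough to skip a new archive request
        static boolean skipArchiveRequest(String cleanDate, int archiveThresholdInDays) throws ParseException {
            Date lastVisitDate = new SimpleDateFormat(DATE_FORMAT).parse(cleanDate);
            long diffInMillies = Math.abs(new Date().getTime() - lastVisitDate.getTime());
            long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
            return archiveThresholdInDays >= diffInDays;
        }

        public static void main(String[] args) throws ParseException {
            // a 2020 visit is older than 365 days, so the request is not skipped
            System.out.println(skipArchiveRequest("2020-03-18T14:50:21+0000", 365)); // false
        }
    }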
diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java
index c4b6412b5..9386b6876 100644
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java
@@ -1,12 +1,15 @@
package eu.dnetlib.dhp.swh;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.CollectorException;
-import eu.dnetlib.dhp.common.collection.HttpClientParams;
-import eu.dnetlib.dhp.swh.utils.SWHConnection;
-import eu.dnetlib.dhp.swh.utils.SWHConstants;
-import eu.dnetlib.dhp.swh.utils.SWHUtils;
+import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
@@ -18,14 +21,12 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.nio.charset.StandardCharsets;
-
-import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+import eu.dnetlib.dhp.common.collection.HttpClientParams;
+import eu.dnetlib.dhp.swh.utils.SWHConnection;
+import eu.dnetlib.dhp.swh.utils.SWHConstants;
+import eu.dnetlib.dhp.swh.utils.SWHUtils;
/**
* Given a file with software repository URLs, this class
@@ -107,7 +108,7 @@ public class CollectLastVisitRepositoryData {
try {
response = swhConnection.call(url.toString());
} catch (CollectorException e) {
- log.info("Error in request: {}", url);
+ log.error("Error in request: {}", url);
response = "{}";
}
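For orientation, a minimal sketch of the kind of last-visit call this class performs, independent of SWHConnection. The endpoint template below is an assumption based on the public SWH API, not taken from SWHConstants.

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;
    import java.util.stream.Collectors;

    public class LastVisitCallSketch {

        // assumption: SWHConstants.SWH_LATEST_VISIT_URL resolves to this public endpoint
        private static final String SWH_LATEST_VISIT_URL = "https://archive.softwareheritage.org/api/1/origin/%s/visit/latest/";

        public static void main(String[] args) throws Exception {
            URL url = new URL(String.format(SWH_LATEST_VISIT_URL, "https://github.com/schwanbeck/YSMR"));
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("GET"); // the workflow passes --requestMethod GET for this step
            try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
                System.out.println(in.lines().collect(Collectors.joining("\n")));
            }
        }
    }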
diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java
index f93280b5e..c1a0fafa5 100644
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java
@@ -1,8 +1,11 @@
package eu.dnetlib.dhp.swh;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Result;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.io.Serializable;
+import java.util.Optional;
+
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
@@ -12,10 +15,8 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.Serializable;
-import java.util.Optional;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Collects unique software repository URLs in the Graph using Hive
@@ -69,7 +70,7 @@ public class CollectSoftwareRepositoryURLs implements Serializable {
"WHERE coderepositoryurl.value IS NOT NULL " +
"AND datainfo.deletedbyinference = FALSE " +
"AND datainfo.invisible = FALSE " +
- "LIMIT 1000"; // TODO remove
+ "LIMIT 1000";
String query = String.format(queryTemplate, hiveDbName);
log.info("Hive query to fetch software code URLs: {}", query);
diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java
index b8cd6de6e..eaff5ce02 100644
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java
@@ -1,21 +1,23 @@
package eu.dnetlib.dhp.swh.models;
+import java.util.Date;
+
+import com.cloudera.com.fasterxml.jackson.annotation.JsonFormat;
import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
-import java.util.Date;
-
@JsonIgnoreProperties(ignoreUnknown = true)
public class LastVisitData {
private String type;
-
- private Date date;
+ private String date;
@JsonProperty("snapshot")
private String snapshotId;
+ private String status;
+
public String getType() {
return type;
}
@@ -24,11 +26,11 @@ public class LastVisitData {
this.type = type;
}
- public Date getDate() {
+ public String getDate() {
return date;
}
- public void setDate(Date date) {
+ public void setDate(String date) {
this.date = date;
}
@@ -39,4 +41,12 @@ public class LastVisitData {
public void setSnapshot(String snapshotId) {
this.snapshotId = snapshotId;
}
+
+ public String getStatus() {
+ return status;
+ }
+
+ public void setStatus(String status) {
+ this.status = status;
+ }
}
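The switch of the date field from Date to String is worth noting: SWH returns timestamps such as "2021-09-13T21:59:27.125171+00:00" (microsecond precision, colon in the zone offset), which default java.util.Date binding does not parse reliably, so the raw string is kept and later normalized via GraphCleaningFunctions.cleanDate. A minimal deserialization sketch, assuming the LastVisitData class from this patch is on the classpath (the payload is trimmed down from the test fixture):

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class LastVisitDataSketch {
        public static void main(String[] args) throws Exception {
            // a trimmed-down payload in the shape returned by the SWH last-visit endpoint
            String json = "{\"date\":\"2021-09-13T21:59:27.125171+00:00\",\"status\":\"failed\",\"snapshot\":null,\"type\":\"hg\"}";
            LastVisitData visit = new ObjectMapper().readValue(json, LastVisitData.class);
            System.out.println(visit.getType() + " / " + visit.getDate() + " / " + visit.getStatus());
        }
    }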
diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHConstants.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHConstants.java
index 1299bc805..f58705188 100644
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHConstants.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHConstants.java
@@ -6,8 +6,10 @@ public class SWHConstants {
public static final String SWH_ARCHIVE_URL = "https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/";
- public static final String ACCESS_TOKEN = "";
+ public static final String ACCESS_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtODQ2Ny05MzAyZjk3MTFkOGEifQ.eyJpYXQiOjE2OTQ2MzYwMjAsImp0aSI6IjkwZjdkNTNjLTQ5YTktNGFiMy1hY2E0LTcwMTViMjEyZTNjNiIsImlzcyI6Imh0dHBzOi8vYXV0aC5zb2Z0d2FyZWhlcml0YWdlLm9yZy9hdXRoL3JlYWxtcy9Tb2Z0d2FyZUhlcml0YWdlIiwiYXVkIjoiaHR0cHM6Ly9hdXRoLnNvZnR3YXJlaGVyaXRhZ2Uub3JnL2F1dGgvcmVhbG1zL1NvZnR3YXJlSGVyaXRhZ2UiLCJzdWIiOiIzMTY5OWZkNC0xNmE0LTQxOWItYTdhMi00NjI5MDY4ZjI3OWEiLCJ0eXAiOiJPZmZsaW5lIiwiYXpwIjoic3doLXdlYiIsInNlc3Npb25fc3RhdGUiOiIzMjYzMzEwMS00ZDRkLTQwMjItODU2NC1iMzNlMTJiNTE3ZDkiLCJzY29wZSI6Im9wZW5pZCBvZmZsaW5lX2FjY2VzcyBwcm9maWxlIGVtYWlsIn0.XHj1VIZu1dZ4Ej32-oU84mFmaox9cLNjXosNxwZM0Xs";
public static final String DEFAULT_VISIT_TYPE = "git";
+ public static final String VISIT_STATUS_NOT_FOUND = "not_found";
+
}
diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHUtils.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHUtils.java
index 8200e7b34..405ce51e4 100644
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHUtils.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHUtils.java
@@ -1,8 +1,14 @@
package eu.dnetlib.dhp.swh.utils;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.HttpClientParams;
+import static eu.dnetlib.dhp.common.Constants.*;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Optional;
+
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -11,13 +17,8 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.Optional;
-
-import static eu.dnetlib.dhp.common.Constants.*;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.collection.HttpClientParams;
public class SWHUtils {
@@ -51,10 +52,10 @@ public class SWHUtils {
log.info("retryDelay is {}", clientParams.getRetryDelay());
clientParams
- .setRequestMethod(
- Optional
- .ofNullable(argumentParser.get(REQUEST_METHOD))
- .orElse(HttpClientParams._requestMethod));
+ .setRequestMethod(
+ Optional
+ .ofNullable(argumentParser.get(REQUEST_METHOD))
+ .orElse(HttpClientParams._requestMethod));
log.info("requestMethod is {}", clientParams.getRequestMethod());
return clientParams;
@@ -63,16 +64,16 @@ public class SWHUtils {
public static BufferedReader getFileReader(FileSystem fs, Path inputPath) throws IOException {
FSDataInputStream inputStream = fs.open(inputPath);
return new BufferedReader(
- new InputStreamReader(inputStream, StandardCharsets.UTF_8));
+ new InputStreamReader(inputStream, StandardCharsets.UTF_8));
}
public static SequenceFile.Writer getSequenceFileWriter(FileSystem fs, String outputPath) throws IOException {
return SequenceFile
- .createWriter(
- fs.getConf(),
- SequenceFile.Writer.file(new Path(outputPath)),
- SequenceFile.Writer.keyClass(Text.class),
- SequenceFile.Writer.valueClass(Text.class));
+ .createWriter(
+ fs.getConf(),
+ SequenceFile.Writer.file(new Path(outputPath)),
+ SequenceFile.Writer.keyClass(Text.class),
+ SequenceFile.Writer.valueClass(Text.class));
}
public static SequenceFile.Reader getSequenceFileReader(FileSystem fs, String inputPath) throws IOException {
diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_archive_repository_urls.json b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_archive_repository_urls.json
index 5ec481305..ce80d6f4a 100644
--- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_archive_repository_urls.json
+++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_archive_repository_urls.json
@@ -11,12 +11,36 @@
"paramDescription": "the URL where to store last visits data",
"paramRequired": true
},
+ {
+ "paramName": "arp",
+ "paramLongName": "archiveRequestsPath",
+ "paramDescription": "the URL where to store the responses of the archive requests",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mnr",
+ "paramLongName": "maxNumberOfRetry",
+ "paramDescription": "the maximum number of admitted connection retries",
+ "paramRequired": false
+ },
{
"paramName": "rqd",
"paramLongName": "requestDelay",
"paramDescription": "the delay (ms) between requests",
"paramRequired": false
},
+ {
+ "paramName": "rtd",
+ "paramLongName": "retryDelay",
+ "paramDescription": "the delay (ms) between retries",
+ "paramRequired": false
+ },
+ {
+ "paramName": "rm",
+ "paramLongName": "requestMethod",
+ "paramDescription": "the method of the requests to perform",
+ "paramRequired": false
+ },
{
"paramName": "atid",
"paramLongName": "archiveThresholdInDays",
diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json
index 6c59123be..8bf41f0ae 100644
--- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json
+++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json
@@ -34,5 +34,11 @@
"paramLongName": "retryDelay",
"paramDescription": "the delay (ms) between retries",
"paramRequired": false
+ },
+ {
+ "paramName": "rm",
+ "paramLongName": "requestMethod",
+ "paramDescription": "the method of the requests to perform",
+ "paramRequired": false
}
]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties
index e2c2af852..4cc1c1e25 100644
--- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties
+++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties
@@ -8,4 +8,8 @@ softwareCodeRepositoryURLs=${workingDir}/1_code_repo_urls.csv
lastVisitsPath=${workingDir}/2_last_visits.seq
archiveRequestsPath=${workingDir}/3_archive_requests.seq
+maxNumberOfRetry=2
+retryDelay=1
+requestDelay=100
+
resume=collect-software-repository-urls
diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml
index 5062d562b..b89165fa2 100644
--- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml
@@ -8,7 +8,27 @@
         <property>
             <name>softwareCodeRepositoryURLs</name>
-            <description>The path in the HDSF to save the software repository URLs</description>
+            <description>The path in the HDFS to save the software repository URLs</description>
         </property>
+        <property>
+            <name>lastVisitsPath</name>
+            <description>The path in the HDFS to save the responses of the last visit requests</description>
+        </property>
+        <property>
+            <name>archiveRequestsPath</name>
+            <description>The path in the HDFS to save the responses of the archive requests</description>
+        </property>
+        <property>
+            <name>maxNumberOfRetry</name>
+            <description>Max number of retries for failed API calls</description>
+        </property>
+        <property>
+            <name>retryDelay</name>
+            <description>Retry delay for failed requests (in sec)</description>
+        </property>
+        <property>
+            <name>requestDelay</name>
+            <description>Delay between API requests (in ms)</description>
+        </property>
 
         <property>
             <name>resume</name>
@@ -75,9 +95,9 @@
             <arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
             <arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
 
-            <arg>--maxNumberOfRetry</arg><arg>2</arg>
-            <arg>--requestDelay</arg><arg>0</arg>
-            <arg>--retryDelay</arg><arg>1</arg>
+            <arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
+            <arg>--requestDelay</arg><arg>${requestDelay}</arg>
+            <arg>--retryDelay</arg><arg>${retryDelay}</arg>
             <arg>--requestMethod</arg><arg>GET</arg>
@@ -91,11 +111,12 @@
             <arg>--namenode</arg><arg>${nameNode}</arg>
             <arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
+            <arg>--archiveRequestsPath</arg><arg>${archiveRequestsPath}</arg>
             <arg>--archiveThresholdInDays</arg><arg>365</arg>
 
-            <arg>--maxNumberOfRetry</arg><arg>2</arg>
-            <arg>--requestDelay</arg><arg>0</arg>
-            <arg>--retryDelay</arg><arg>1</arg>
+            <arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
+            <arg>--requestDelay</arg><arg>${requestDelay}</arg>
+            <arg>--retryDelay</arg><arg>${retryDelay}</arg>
             <arg>--requestMethod</arg><arg>POST</arg>
diff --git a/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLsTest.java b/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLsTest.java
index 06e40ae14..e069e9655 100644
--- a/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLsTest.java
+++ b/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLsTest.java
@@ -1,35 +1,38 @@
-package eu.dnetlib.dhp.swh;
-import eu.dnetlib.dhp.swh.utils.SWHUtils;
-import org.apache.hadoop.fs.FileSystem;
-import org.junit.jupiter.api.Test;
+package eu.dnetlib.dhp.swh;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
+import java.text.ParseException;
import java.util.Arrays;
+import org.apache.hadoop.fs.FileSystem;
+import org.junit.jupiter.api.Test;
+
+import eu.dnetlib.dhp.swh.utils.SWHUtils;
+
public class ArchiveRepositoryURLsTest {
- @Test
- void testArchive() throws IOException {
- String inputPath = getClass()
- .getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv")
- .getPath();
+ @Test
+ void testArchive() throws IOException, ParseException {
+ String inputPath = getClass()
+ .getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv")
+ .getPath();
- File file = new File(inputPath);
- FileReader fr = new FileReader(file);
- BufferedReader br = new BufferedReader(fr); //creates a buffering character input stream
+ File file = new File(inputPath);
+ FileReader fr = new FileReader(file);
+ BufferedReader br = new BufferedReader(fr); // creates a buffering character input stream
- String line;
- while((line = br.readLine()) != null) {
- String[] tokens = line.split("\t");
+ String line;
+ while ((line = br.readLine()) != null) {
+ String[] tokens = line.split("\t");
- String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365);
- System.out.println(tokens[0] + "\t" + response);
- System.out.println();
- }
- fr.close();
- }
+ String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365);
+ System.out.println(tokens[0] + "\t" + response);
+ System.out.println();
+ }
+ fr.close();
+ }
}
diff --git a/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/SWHConnectionTest.java b/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/SWHConnectionTest.java
index d69f6ff1b..28210f1b3 100644
--- a/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/SWHConnectionTest.java
+++ b/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/SWHConnectionTest.java
@@ -1,17 +1,18 @@
package eu.dnetlib.dhp.swh;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URL;
//import org.apache.hadoop.hdfs.MiniDFSCluster;
@@ -24,7 +25,7 @@ public class SWHConnectionTest {
HttpClientParams clientParams = new HttpClientParams();
clientParams.setRequestMethod("GET");
- SWHConnection swhConnection = new SWHConnection(clientParams);
+ SWHConnection swhConnection = new SWHConnection(clientParams);
String repoUrl = "https://github.com/stanford-futuredata/FAST";
URL url = new URL(String.format(SWHConstants.SWH_LATEST_VISIT_URL, repoUrl));
@@ -42,7 +43,7 @@ public class SWHConnectionTest {
HttpClientParams clientParams = new HttpClientParams();
clientParams.setRequestMethod("POST");
- SWHConnection swhConnection = new SWHConnection(clientParams);
+ SWHConnection swhConnection = new SWHConnection(clientParams);
String repoUrl = "https://github.com/stanford-futuredata/FAST";
URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, SWHConstants.DEFAULT_VISIT_TYPE, repoUrl));
diff --git a/dhp-workflows/dhp-swh/src/test/resources/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv b/dhp-workflows/dhp-swh/src/test/resources/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv
index 6477dd62a..568ccf482 100644
--- a/dhp-workflows/dhp-swh/src/test/resources/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv
+++ b/dhp-workflows/dhp-swh/src/test/resources/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv
@@ -1,3 +1,4 @@
+https://bitbucket.org/samskillman/yt-stokes {"origin":"https://bitbucket.org/samskillman/yt-stokes","visit":43,"date":"2021-09-13T21:59:27.125171+00:00","status":"failed","snapshot":null,"type":"hg","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://bitbucket.org/samskillman/yt-stokes/get/","snapshot_url":null}
https://github.com/bioinsilico/BIPSPI {"origin":"https://github.com/bioinsilico/BIPSPI","visit":1,"date":"2020-03-18T14:50:21.541822+00:00","status":"full","snapshot":"c6c69d2cd73ce89811448da5f031611df6f63bdb","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/bioinsilico/BIPSPI/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/c6c69d2cd73ce89811448da5f031611df6f63bdb/"}
https://github.com/mloop/kdiff-type1-error-rate/blob/master/analysis/simulation.R {}
https://github.com/schwanbeck/YSMR {"origin":"https://github.com/schwanbeck/YSMR","visit":6,"date":"2023-08-02T15:25:02.650676+00:00","status":"full","snapshot":"a9d1c5f0bca2def198b89f65bc9f7da3be8439ed","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/schwanbeck/YSMR/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/a9d1c5f0bca2def198b89f65bc9f7da3be8439ed/"}
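The new first fixture row exercises the branch where a failed visit (null snapshot) still triggers an archive request with the recorded visit type: handleRecord skips the threshold check, since there is no snapshot, and re-uses the "hg" type instead of the git default. A hedged sketch of the URL it would build, re-using the SWH_ARCHIVE_URL template from SWHConstants (class and method names here are illustrative):

    public class FixtureRowSketch {

        // copied from SWHConstants.SWH_ARCHIVE_URL in this patch
        private static final String SWH_ARCHIVE_URL = "https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/";

        public static void main(String[] args) {
            // snapshot == null, so the visit type recorded by SWH ("hg") is re-used
            String repoUrl = "https://bitbucket.org/samskillman/yt-stokes";
            String visitType = "hg";
            System.out.println(String.format(SWH_ARCHIVE_URL, visitType, repoUrl.trim()));
        }
    }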