From ab0d70691cf4c21b886142b9a5d2b7327d6445d5 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Thu, 28 Sep 2023 20:56:18 +0300 Subject: [PATCH] Add step for archiving repoUrls to SWH --- .../common/collection/HttpClientParams.java | 4 +- .../dhp/swh/ArchiveRepositoryURLs.java | 99 +++++++++++++------ .../swh/CollectLastVisitRepositoryData.java | 31 +++--- .../swh/CollectSoftwareRepositoryURLs.java | 15 +-- .../dnetlib/dhp/swh/models/LastVisitData.java | 22 +++-- .../dnetlib/dhp/swh/utils/SWHConstants.java | 4 +- .../eu/dnetlib/dhp/swh/utils/SWHUtils.java | 39 ++++---- .../swh/input_archive_repository_urls.json | 24 +++++ ...ut_collect_last_visit_repository_data.json | 6 ++ .../eu/dnetlib/dhp/swh/job.properties | 4 + .../eu/dnetlib/dhp/swh/oozie_app/workflow.xml | 35 +++++-- .../dhp/swh/ArchiveRepositoryURLsTest.java | 45 +++++---- .../eu/dnetlib/dhp/swh/SWHConnectionTest.java | 19 ++-- .../dhp/swh/lastVisitDataToArchive.csv | 1 + 14 files changed, 230 insertions(+), 118 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java index 55f9ceb8b..d26d9c0e9 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java @@ -53,9 +53,9 @@ public class HttpClientParams { */ private String requestMethod; - public HttpClientParams() { - this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(), _requestMethod); + this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(), + _requestMethod); } public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut, diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLs.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLs.java index 7b3b74d9e..38db27baf 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLs.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLs.java @@ -1,14 +1,16 @@ package eu.dnetlib.dhp.swh; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.collection.CollectorException; -import eu.dnetlib.dhp.common.collection.HttpClientParams; -import eu.dnetlib.dhp.swh.models.LastVisitData; -import eu.dnetlib.dhp.swh.utils.SWHConnection; -import eu.dnetlib.dhp.swh.utils.SWHConstants; -import eu.dnetlib.dhp.swh.utils.SWHUtils; +import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD; +import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; + +import java.io.IOException; +import java.net.URL; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FileSystem; @@ -17,14 +19,17 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.net.URL; -import java.util.Date; -import java.util.Optional; -import java.util.concurrent.TimeUnit; +import com.fasterxml.jackson.databind.ObjectMapper; -import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD; -import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.common.collection.HttpClientParams; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions; +import eu.dnetlib.dhp.swh.models.LastVisitData; +import eu.dnetlib.dhp.swh.utils.SWHConnection; +import eu.dnetlib.dhp.swh.utils.SWHConstants; +import eu.dnetlib.dhp.swh.utils.SWHUtils; /** * Sends archive requests to the SWH API for those software repository URLs that are missing from them @@ -69,7 +74,8 @@ public class ArchiveRepositoryURLs { } - private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays) throws IOException { + private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays) + throws IOException { SequenceFile.Reader fr = SWHUtils.getSequenceFileReader(fs, inputPath); SequenceFile.Writer fw = SWHUtils.getSequenceFileWriter(fs, outputPath); @@ -81,7 +87,13 @@ public class ArchiveRepositoryURLs { // Read key-value pairs from the SequenceFile and handle appropriately while (fr.next(repoUrl, lastVisitData)) { - String response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays); + String response = null; + try { + response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays); + } catch (java.text.ParseException e) { + log.error("Could not handle record with repo Url: {}", repoUrl.toString()); + throw new RuntimeException(e); + } // response is equal to null when no need for request if (response != null) { @@ -95,43 +107,68 @@ public class ArchiveRepositoryURLs { fr.close(); } - public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays) throws IOException { - System.out.println("Key: " + repoUrl + ", Value: " + lastVisitData); + public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays) + throws IOException, java.text.ParseException { + + log.info("{ Key: {}, Value: {} }", repoUrl, lastVisitData); LastVisitData lastVisit = OBJECT_MAPPER.readValue(lastVisitData, LastVisitData.class); - // perform an archive request when no repoUrl was not found in previous step + // a previous attempt for archival has been made, and repository URL was not found + // avoid performing the same archive request again + if (lastVisit.getType() != null && + lastVisit.getType().equals(SWHConstants.VISIT_STATUS_NOT_FOUND)) { + + log.info("Avoid request -- previous archive request returned NOT_FOUND"); + return null; + } + + // if we have last visit data if (lastVisit.getSnapshot() != null) { - // OR last visit was before (now() - archiveThresholdInDays) - long diffInMillies = Math.abs((new Date()).getTime() - lastVisit.getDate().getTime()); - long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS); + String cleanDate = GraphCleaningFunctions.cleanDate(lastVisit.getDate()); - if (archiveThresholdInDays >= diffInDays) { - return null; + // and the last visit date can be parsed + if (cleanDate != null) { + + SimpleDateFormat formatter = new SimpleDateFormat(ModelSupport.DATE_FORMAT); + Date lastVisitDate = formatter.parse(cleanDate); + + // OR last visit time < (now() - archiveThresholdInDays) + long diffInMillies = Math.abs((new Date()).getTime() - lastVisitDate.getTime()); + long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS); + log.info("Date diff from now (in days): {}", diffInDays); + + // do not perform a request, if the last visit date is no older than $archiveThresholdInDays + if (archiveThresholdInDays >= diffInDays) { + log.info("Avoid request -- no older than {} days", archiveThresholdInDays); + return null; + } } } - // if last visit data are available, re-use version control type, else use the default one (i.e., git) + // ELSE perform an archive request + log.info("Perform archive request for: {}", repoUrl); + + // if last visit data are available, re-use version control type, + // else use the default one (i.e., git) String visitType = Optional .ofNullable(lastVisit.getType()) .orElse(SWHConstants.DEFAULT_VISIT_TYPE); URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, visitType, repoUrl.trim())); - System.out.println(url.toString()); + + log.info("Sending archive request: {}", url); String response; try { response = swhConnection.call(url.toString()); } catch (CollectorException e) { - log.info("Error in request: {}", url); + log.error("Error in request: {}", url); response = "{}"; } return response; - } - - } diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java index c4b6412b5..9386b6876 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java @@ -1,12 +1,15 @@ package eu.dnetlib.dhp.swh; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.collection.CollectorException; -import eu.dnetlib.dhp.common.collection.HttpClientParams; -import eu.dnetlib.dhp.swh.utils.SWHConnection; -import eu.dnetlib.dhp.swh.utils.SWHConstants; -import eu.dnetlib.dhp.swh.utils.SWHUtils; +import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.charset.StandardCharsets; + import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FSDataInputStream; @@ -18,14 +21,12 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.URISyntaxException; -import java.net.URL; -import java.nio.charset.StandardCharsets; - -import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.common.collection.HttpClientParams; +import eu.dnetlib.dhp.swh.utils.SWHConnection; +import eu.dnetlib.dhp.swh.utils.SWHConstants; +import eu.dnetlib.dhp.swh.utils.SWHUtils; /** * Given a file with software repository URLs, this class @@ -107,7 +108,7 @@ public class CollectLastVisitRepositoryData { try { response = swhConnection.call(url.toString()); } catch (CollectorException e) { - log.info("Error in request: {}", url); + log.error("Error in request: {}", url); response = "{}"; } diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java index f93280b5e..c1a0fafa5 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java @@ -1,8 +1,11 @@ package eu.dnetlib.dhp.swh; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Result; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.io.Serializable; +import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; @@ -12,10 +15,8 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.Serializable; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Result; /** * Collects unique software repository URLs in the Graph using Hive @@ -69,7 +70,7 @@ public class CollectSoftwareRepositoryURLs implements Serializable { "WHERE coderepositoryurl.value IS NOT NULL " + "AND datainfo.deletedbyinference = FALSE " + "AND datainfo.invisible = FALSE " + - "LIMIT 1000"; // TODO remove + "LIMIT 1000"; String query = String.format(queryTemplate, hiveDbName); log.info("Hive query to fetch software code URLs: {}", query); diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java index b8cd6de6e..eaff5ce02 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java @@ -1,21 +1,23 @@ package eu.dnetlib.dhp.swh.models; +import java.util.Date; + +import com.cloudera.com.fasterxml.jackson.annotation.JsonFormat; import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import java.util.Date; - @JsonIgnoreProperties(ignoreUnknown = true) public class LastVisitData { private String type; - - private Date date; + private String date; @JsonProperty("snapshot") private String snapshotId; + private String status; + public String getType() { return type; } @@ -24,11 +26,11 @@ public class LastVisitData { this.type = type; } - public Date getDate() { + public String getDate() { return date; } - public void setDate(Date date) { + public void setDate(String date) { this.date = date; } @@ -39,4 +41,12 @@ public class LastVisitData { public void setSnapshot(String snapshotId) { this.snapshotId = snapshotId; } + + public String getStatus() { + return status; + } + + public void setStatus(String status) { + this.status = status; + } } diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHConstants.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHConstants.java index 1299bc805..f58705188 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHConstants.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHConstants.java @@ -6,8 +6,10 @@ public class SWHConstants { public static final String SWH_ARCHIVE_URL = "https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/"; - public static final String ACCESS_TOKEN = ""; + public static final String ACCESS_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtODQ2Ny05MzAyZjk3MTFkOGEifQ.eyJpYXQiOjE2OTQ2MzYwMjAsImp0aSI6IjkwZjdkNTNjLTQ5YTktNGFiMy1hY2E0LTcwMTViMjEyZTNjNiIsImlzcyI6Imh0dHBzOi8vYXV0aC5zb2Z0d2FyZWhlcml0YWdlLm9yZy9hdXRoL3JlYWxtcy9Tb2Z0d2FyZUhlcml0YWdlIiwiYXVkIjoiaHR0cHM6Ly9hdXRoLnNvZnR3YXJlaGVyaXRhZ2Uub3JnL2F1dGgvcmVhbG1zL1NvZnR3YXJlSGVyaXRhZ2UiLCJzdWIiOiIzMTY5OWZkNC0xNmE0LTQxOWItYTdhMi00NjI5MDY4ZjI3OWEiLCJ0eXAiOiJPZmZsaW5lIiwiYXpwIjoic3doLXdlYiIsInNlc3Npb25fc3RhdGUiOiIzMjYzMzEwMS00ZDRkLTQwMjItODU2NC1iMzNlMTJiNTE3ZDkiLCJzY29wZSI6Im9wZW5pZCBvZmZsaW5lX2FjY2VzcyBwcm9maWxlIGVtYWlsIn0.XHj1VIZu1dZ4Ej32-oU84mFmaox9cLNjXosNxwZM0Xs"; public static final String DEFAULT_VISIT_TYPE = "git"; + public static final String VISIT_STATUS_NOT_FOUND = "not_found"; + } diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHUtils.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHUtils.java index 8200e7b34..405ce51e4 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHUtils.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/utils/SWHUtils.java @@ -1,8 +1,14 @@ package eu.dnetlib.dhp.swh.utils; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.collection.HttpClientParams; +import static eu.dnetlib.dhp.common.Constants.*; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.Optional; + import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -11,13 +17,8 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.Constants.*; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.HttpClientParams; public class SWHUtils { @@ -51,10 +52,10 @@ public class SWHUtils { log.info("retryDelay is {}", clientParams.getRetryDelay()); clientParams - .setRequestMethod( - Optional - .ofNullable(argumentParser.get(REQUEST_METHOD)) - .orElse(HttpClientParams._requestMethod)); + .setRequestMethod( + Optional + .ofNullable(argumentParser.get(REQUEST_METHOD)) + .orElse(HttpClientParams._requestMethod)); log.info("requestMethod is {}", clientParams.getRequestMethod()); return clientParams; @@ -63,16 +64,16 @@ public class SWHUtils { public static BufferedReader getFileReader(FileSystem fs, Path inputPath) throws IOException { FSDataInputStream inputStream = fs.open(inputPath); return new BufferedReader( - new InputStreamReader(inputStream, StandardCharsets.UTF_8)); + new InputStreamReader(inputStream, StandardCharsets.UTF_8)); } public static SequenceFile.Writer getSequenceFileWriter(FileSystem fs, String outputPath) throws IOException { return SequenceFile - .createWriter( - fs.getConf(), - SequenceFile.Writer.file(new Path(outputPath)), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class)); + .createWriter( + fs.getConf(), + SequenceFile.Writer.file(new Path(outputPath)), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class)); } public static SequenceFile.Reader getSequenceFileReader(FileSystem fs, String inputPath) throws IOException { diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_archive_repository_urls.json b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_archive_repository_urls.json index 5ec481305..ce80d6f4a 100644 --- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_archive_repository_urls.json +++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_archive_repository_urls.json @@ -11,12 +11,36 @@ "paramDescription": "the URL where to store last visits data", "paramRequired": true }, + { + "paramName": "arp", + "paramLongName": "archiveRequestsPath", + "paramDescription": "the URL where to store the responses of the archive requests", + "paramRequired": true + }, + { + "paramName": "mnr", + "paramLongName": "maxNumberOfRetry", + "paramDescription": "the maximum number of admitted connection retries", + "paramRequired": false + }, { "paramName": "rqd", "paramLongName": "requestDelay", "paramDescription": "the delay (ms) between requests", "paramRequired": false }, + { + "paramName": "rtd", + "paramLongName": "retryDelay", + "paramDescription": "the delay (ms) between retries", + "paramRequired": false + }, + { + "paramName": "rm", + "paramLongName": "requestMethod", + "paramDescription": "the method of the requests to perform", + "paramRequired": false + }, { "paramName": "atid", "paramLongName": "archiveThresholdInDays", diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json index 6c59123be..8bf41f0ae 100644 --- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json +++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json @@ -34,5 +34,11 @@ "paramLongName": "retryDelay", "paramDescription": "the delay (ms) between retries", "paramRequired": false + }, + { + "paramName": "rm", + "paramLongName": "requestMethod", + "paramDescription": "the method of the requests to perform", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties index e2c2af852..4cc1c1e25 100644 --- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties +++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties @@ -8,4 +8,8 @@ softwareCodeRepositoryURLs=${workingDir}/1_code_repo_urls.csv lastVisitsPath=${workingDir}/2_last_visits.seq archiveRequestsPath=${workingDir}/3_archive_requests.seq +maxNumberOfRetry=2 +retryDelay=1 +requestDelay=100 + resume=collect-software-repository-urls diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml index 5062d562b..b89165fa2 100644 --- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml @@ -8,7 +8,27 @@ softwareCodeRepositoryURLs - The path in the HDSF to save the software repository URLs + The path in the HDFS to save the software repository URLs + + + lastVisitsPath + The path in the HDFS to save the responses of the last visit requests + + + archiveRequestsPath + The path in the HDFS to save the responses of the archive requests + + + maxNumberOfRetry + Max number of retries for failed API calls + + + retryDelay + Retry delay for failed requests (in sec) + + + requestDelay + Delay between API requests (in ms) resume @@ -75,9 +95,9 @@ --softwareCodeRepositoryURLs${softwareCodeRepositoryURLs} --lastVisitsPath${lastVisitsPath} - --maxNumberOfRetry2 - --requestDelay0 - --retryDelay1 + --maxNumberOfRetry${maxNumberOfRetry} + --requestDelay${requestDelay} + --retryDelay${retryDelay} --requestMethodGET @@ -91,11 +111,12 @@ --namenode${nameNode} --lastVisitsPath${lastVisitsPath} + --archiveRequestsPath${archiveRequestsPath} --archiveThresholdInDays365 - --maxNumberOfRetry2 - --requestDelay0 - --retryDelay1 + --maxNumberOfRetry${maxNumberOfRetry} + --requestDelay${requestDelay} + --retryDelay${retryDelay} --requestMethodPOST diff --git a/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLsTest.java b/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLsTest.java index 06e40ae14..e069e9655 100644 --- a/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLsTest.java +++ b/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/ArchiveRepositoryURLsTest.java @@ -1,35 +1,38 @@ -package eu.dnetlib.dhp.swh; -import eu.dnetlib.dhp.swh.utils.SWHUtils; -import org.apache.hadoop.fs.FileSystem; -import org.junit.jupiter.api.Test; +package eu.dnetlib.dhp.swh; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.text.ParseException; import java.util.Arrays; +import org.apache.hadoop.fs.FileSystem; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.swh.utils.SWHUtils; + public class ArchiveRepositoryURLsTest { - @Test - void testArchive() throws IOException { - String inputPath = getClass() - .getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv") - .getPath(); + @Test + void testArchive() throws IOException, ParseException { + String inputPath = getClass() + .getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv") + .getPath(); - File file = new File(inputPath); - FileReader fr = new FileReader(file); - BufferedReader br = new BufferedReader(fr); //creates a buffering character input stream + File file = new File(inputPath); + FileReader fr = new FileReader(file); + BufferedReader br = new BufferedReader(fr); // creates a buffering character input stream - String line; - while((line = br.readLine()) != null) { - String[] tokens = line.split("\t"); + String line; + while ((line = br.readLine()) != null) { + String[] tokens = line.split("\t"); - String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365); - System.out.println(tokens[0] + "\t" + response); - System.out.println(); - } - fr.close(); - } + String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365); + System.out.println(tokens[0] + "\t" + response); + System.out.println(); + } + fr.close(); + } } diff --git a/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/SWHConnectionTest.java b/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/SWHConnectionTest.java index d69f6ff1b..28210f1b3 100644 --- a/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/SWHConnectionTest.java +++ b/dhp-workflows/dhp-swh/src/test/java/eu/dnetlib/dhp/swh/SWHConnectionTest.java @@ -1,17 +1,18 @@ package eu.dnetlib.dhp.swh; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; + +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.swh.utils.SWHConnection; import eu.dnetlib.dhp.swh.utils.SWHConstants; -import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; //import org.apache.hadoop.hdfs.MiniDFSCluster; @@ -24,7 +25,7 @@ public class SWHConnectionTest { HttpClientParams clientParams = new HttpClientParams(); clientParams.setRequestMethod("GET"); - SWHConnection swhConnection = new SWHConnection(clientParams); + SWHConnection swhConnection = new SWHConnection(clientParams); String repoUrl = "https://github.com/stanford-futuredata/FAST"; URL url = new URL(String.format(SWHConstants.SWH_LATEST_VISIT_URL, repoUrl)); @@ -42,7 +43,7 @@ public class SWHConnectionTest { HttpClientParams clientParams = new HttpClientParams(); clientParams.setRequestMethod("POST"); - SWHConnection swhConnection = new SWHConnection(clientParams); + SWHConnection swhConnection = new SWHConnection(clientParams); String repoUrl = "https://github.com/stanford-futuredata/FAST"; URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, SWHConstants.DEFAULT_VISIT_TYPE, repoUrl)); diff --git a/dhp-workflows/dhp-swh/src/test/resources/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv b/dhp-workflows/dhp-swh/src/test/resources/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv index 6477dd62a..568ccf482 100644 --- a/dhp-workflows/dhp-swh/src/test/resources/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv +++ b/dhp-workflows/dhp-swh/src/test/resources/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv @@ -1,3 +1,4 @@ +https://bitbucket.org/samskillman/yt-stokes {"origin":"https://bitbucket.org/samskillman/yt-stokes","visit":43,"date":"2021-09-13T21:59:27.125171+00:00","status":"failed","snapshot":null,"type":"hg","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://bitbucket.org/samskillman/yt-stokes/get/","snapshot_url":null} https://github.com/bioinsilico/BIPSPI {"origin":"https://github.com/bioinsilico/BIPSPI","visit":1,"date":"2020-03-18T14:50:21.541822+00:00","status":"full","snapshot":"c6c69d2cd73ce89811448da5f031611df6f63bdb","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/bioinsilico/BIPSPI/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/c6c69d2cd73ce89811448da5f031611df6f63bdb/"} https://github.com/mloop/kdiff-type1-error-rate/blob/master/analysis/simulation.R {} https://github.com/schwanbeck/YSMR {"origin":"https://github.com/schwanbeck/YSMR","visit":6,"date":"2023-08-02T15:25:02.650676+00:00","status":"full","snapshot":"a9d1c5f0bca2def198b89f65bc9f7da3be8439ed","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/schwanbeck/YSMR/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/a9d1c5f0bca2def198b89f65bc9f7da3be8439ed/"}