Add step for archiving repoUrls to SWH

This commit is contained in:
Serafeim Chatzopoulos 2023-09-28 20:56:18 +03:00
parent ed9c81a0b7
commit ab0d70691c
14 changed files with 230 additions and 118 deletions

View File

@ -53,9 +53,9 @@ public class HttpClientParams {
*/ */
private String requestMethod; private String requestMethod;
public HttpClientParams() { public HttpClientParams() {
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(), _requestMethod); this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(),
_requestMethod);
} }
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut, public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,

View File

@ -1,14 +1,16 @@
package eu.dnetlib.dhp.swh; package eu.dnetlib.dhp.swh;
import com.fasterxml.jackson.databind.ObjectMapper; import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import java.io.IOException;
import eu.dnetlib.dhp.swh.models.LastVisitData; import java.net.URL;
import eu.dnetlib.dhp.swh.utils.SWHConnection; import java.text.SimpleDateFormat;
import eu.dnetlib.dhp.swh.utils.SWHConstants; import java.util.Date;
import eu.dnetlib.dhp.swh.utils.SWHUtils; import java.util.Optional;
import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.ParseException; import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
@ -17,14 +19,17 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import com.fasterxml.jackson.databind.ObjectMapper;
import java.net.URL;
import java.util.Date;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.swh.models.LastVisitData;
import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants;
import eu.dnetlib.dhp.swh.utils.SWHUtils;
/** /**
* Sends archive requests to the SWH API for those software repository URLs that are missing from the SWH archive * Sends archive requests to the SWH API for those software repository URLs that are missing from the SWH archive
@ -69,7 +74,8 @@ public class ArchiveRepositoryURLs {
} }
private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays) throws IOException { private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays)
throws IOException {
SequenceFile.Reader fr = SWHUtils.getSequenceFileReader(fs, inputPath); SequenceFile.Reader fr = SWHUtils.getSequenceFileReader(fs, inputPath);
SequenceFile.Writer fw = SWHUtils.getSequenceFileWriter(fs, outputPath); SequenceFile.Writer fw = SWHUtils.getSequenceFileWriter(fs, outputPath);
@ -81,7 +87,13 @@ public class ArchiveRepositoryURLs {
// Read key-value pairs from the SequenceFile and handle appropriately // Read key-value pairs from the SequenceFile and handle appropriately
while (fr.next(repoUrl, lastVisitData)) { while (fr.next(repoUrl, lastVisitData)) {
String response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays); String response = null;
try {
response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
} catch (java.text.ParseException e) {
log.error("Could not handle record with repo Url: {}", repoUrl.toString());
throw new RuntimeException(e);
}
// response is equal to null when no need for request // response is equal to null when no need for request
if (response != null) { if (response != null) {
@ -95,43 +107,68 @@ public class ArchiveRepositoryURLs {
fr.close(); fr.close();
} }
public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays) throws IOException { public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays)
System.out.println("Key: " + repoUrl + ", Value: " + lastVisitData); throws IOException, java.text.ParseException {
log.info("{ Key: {}, Value: {} }", repoUrl, lastVisitData);
LastVisitData lastVisit = OBJECT_MAPPER.readValue(lastVisitData, LastVisitData.class); LastVisitData lastVisit = OBJECT_MAPPER.readValue(lastVisitData, LastVisitData.class);
// perform an archive request when no repoUrl was not found in previous step // a previous attempt for archival has been made, and repository URL was not found
// avoid performing the same archive request again
if (lastVisit.getType() != null &&
lastVisit.getType().equals(SWHConstants.VISIT_STATUS_NOT_FOUND)) {
log.info("Avoid request -- previous archive request returned NOT_FOUND");
return null;
}
// if we have last visit data
if (lastVisit.getSnapshot() != null) { if (lastVisit.getSnapshot() != null) {
// OR last visit was before (now() - archiveThresholdInDays) String cleanDate = GraphCleaningFunctions.cleanDate(lastVisit.getDate());
long diffInMillies = Math.abs((new Date()).getTime() - lastVisit.getDate().getTime());
long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
if (archiveThresholdInDays >= diffInDays) { // and the last visit date can be parsed
return null; if (cleanDate != null) {
SimpleDateFormat formatter = new SimpleDateFormat(ModelSupport.DATE_FORMAT);
Date lastVisitDate = formatter.parse(cleanDate);
// OR last visit time < (now() - archiveThresholdInDays)
long diffInMillies = Math.abs((new Date()).getTime() - lastVisitDate.getTime());
long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
log.info("Date diff from now (in days): {}", diffInDays);
// do not perform a request, if the last visit date is no older than $archiveThresholdInDays
if (archiveThresholdInDays >= diffInDays) {
log.info("Avoid request -- no older than {} days", archiveThresholdInDays);
return null;
}
} }
} }
// if last visit data are available, re-use version control type, else use the default one (i.e., git) // ELSE perform an archive request
log.info("Perform archive request for: {}", repoUrl);
// if last visit data are available, re-use version control type,
// else use the default one (i.e., git)
String visitType = Optional String visitType = Optional
.ofNullable(lastVisit.getType()) .ofNullable(lastVisit.getType())
.orElse(SWHConstants.DEFAULT_VISIT_TYPE); .orElse(SWHConstants.DEFAULT_VISIT_TYPE);
URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, visitType, repoUrl.trim())); URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, visitType, repoUrl.trim()));
System.out.println(url.toString());
log.info("Sending archive request: {}", url);
String response; String response;
try { try {
response = swhConnection.call(url.toString()); response = swhConnection.call(url.toString());
} catch (CollectorException e) { } catch (CollectorException e) {
log.info("Error in request: {}", url); log.error("Error in request: {}", url);
response = "{}"; response = "{}";
} }
return response; return response;
} }
} }

View File

@ -1,12 +1,15 @@
package eu.dnetlib.dhp.swh; package eu.dnetlib.dhp.swh;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import java.io.BufferedReader;
import eu.dnetlib.dhp.swh.utils.SWHConnection; import java.io.IOException;
import eu.dnetlib.dhp.swh.utils.SWHConstants; import java.io.InputStreamReader;
import eu.dnetlib.dhp.swh.utils.SWHUtils; import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import org.apache.commons.cli.ParseException; import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataInputStream;
@ -18,14 +21,12 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.BufferedReader; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.io.IOException; import eu.dnetlib.dhp.common.collection.CollectorException;
import java.io.InputStreamReader; import eu.dnetlib.dhp.common.collection.HttpClientParams;
import java.net.URISyntaxException; import eu.dnetlib.dhp.swh.utils.SWHConnection;
import java.net.URL; import eu.dnetlib.dhp.swh.utils.SWHConstants;
import java.nio.charset.StandardCharsets; import eu.dnetlib.dhp.swh.utils.SWHUtils;
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
/** /**
* Given a file with software repository URLs, this class * Given a file with software repository URLs, this class
@ -107,7 +108,7 @@ public class CollectLastVisitRepositoryData {
try { try {
response = swhConnection.call(url.toString()); response = swhConnection.call(url.toString());
} catch (CollectorException e) { } catch (CollectorException e) {
log.info("Error in request: {}", url); log.error("Error in request: {}", url);
response = "{}"; response = "{}";
} }

View File

@ -1,8 +1,11 @@
package eu.dnetlib.dhp.swh; package eu.dnetlib.dhp.swh;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import eu.dnetlib.dhp.schema.oaf.Result;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -12,10 +15,8 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.Serializable; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.util.Optional; import eu.dnetlib.dhp.schema.oaf.Result;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
/** /**
* Collects unique software repository URLs in the Graph using Hive * Collects unique software repository URLs in the Graph using Hive
@ -69,7 +70,7 @@ public class CollectSoftwareRepositoryURLs implements Serializable {
"WHERE coderepositoryurl.value IS NOT NULL " + "WHERE coderepositoryurl.value IS NOT NULL " +
"AND datainfo.deletedbyinference = FALSE " + "AND datainfo.deletedbyinference = FALSE " +
"AND datainfo.invisible = FALSE " + "AND datainfo.invisible = FALSE " +
"LIMIT 1000"; // TODO remove "LIMIT 1000";
String query = String.format(queryTemplate, hiveDbName); String query = String.format(queryTemplate, hiveDbName);
log.info("Hive query to fetch software code URLs: {}", query); log.info("Hive query to fetch software code URLs: {}", query);

View File

@ -1,21 +1,23 @@
package eu.dnetlib.dhp.swh.models; package eu.dnetlib.dhp.swh.models;
import java.util.Date;
import com.cloudera.com.fasterxml.jackson.annotation.JsonFormat;
import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty; import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import java.util.Date;
@JsonIgnoreProperties(ignoreUnknown = true) @JsonIgnoreProperties(ignoreUnknown = true)
public class LastVisitData { public class LastVisitData {
private String type; private String type;
private String date;
private Date date;
@JsonProperty("snapshot") @JsonProperty("snapshot")
private String snapshotId; private String snapshotId;
private String status;
public String getType() { public String getType() {
return type; return type;
} }
@ -24,11 +26,11 @@ public class LastVisitData {
this.type = type; this.type = type;
} }
public Date getDate() { public String getDate() {
return date; return date;
} }
public void setDate(Date date) { public void setDate(String date) {
this.date = date; this.date = date;
} }
@ -39,4 +41,12 @@ public class LastVisitData {
public void setSnapshot(String snapshotId) { public void setSnapshot(String snapshotId) {
this.snapshotId = snapshotId; this.snapshotId = snapshotId;
} }
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
} }

View File

@ -6,8 +6,10 @@ public class SWHConstants {
public static final String SWH_ARCHIVE_URL = "https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/"; public static final String SWH_ARCHIVE_URL = "https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/";
public static final String ACCESS_TOKEN = ""; public static final String ACCESS_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtODQ2Ny05MzAyZjk3MTFkOGEifQ.eyJpYXQiOjE2OTQ2MzYwMjAsImp0aSI6IjkwZjdkNTNjLTQ5YTktNGFiMy1hY2E0LTcwMTViMjEyZTNjNiIsImlzcyI6Imh0dHBzOi8vYXV0aC5zb2Z0d2FyZWhlcml0YWdlLm9yZy9hdXRoL3JlYWxtcy9Tb2Z0d2FyZUhlcml0YWdlIiwiYXVkIjoiaHR0cHM6Ly9hdXRoLnNvZnR3YXJlaGVyaXRhZ2Uub3JnL2F1dGgvcmVhbG1zL1NvZnR3YXJlSGVyaXRhZ2UiLCJzdWIiOiIzMTY5OWZkNC0xNmE0LTQxOWItYTdhMi00NjI5MDY4ZjI3OWEiLCJ0eXAiOiJPZmZsaW5lIiwiYXpwIjoic3doLXdlYiIsInNlc3Npb25fc3RhdGUiOiIzMjYzMzEwMS00ZDRkLTQwMjItODU2NC1iMzNlMTJiNTE3ZDkiLCJzY29wZSI6Im9wZW5pZCBvZmZsaW5lX2FjY2VzcyBwcm9maWxlIGVtYWlsIn0.XHj1VIZu1dZ4Ej32-oU84mFmaox9cLNjXosNxwZM0Xs";
public static final String DEFAULT_VISIT_TYPE = "git"; public static final String DEFAULT_VISIT_TYPE = "git";
public static final String VISIT_STATUS_NOT_FOUND = "not_found";
} }

View File

@ -1,8 +1,14 @@
package eu.dnetlib.dhp.swh.utils; package eu.dnetlib.dhp.swh.utils;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static eu.dnetlib.dhp.common.Constants.*;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
@ -11,13 +17,8 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.BufferedReader; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.io.IOException; import eu.dnetlib.dhp.common.collection.HttpClientParams;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import static eu.dnetlib.dhp.common.Constants.*;
public class SWHUtils { public class SWHUtils {
@ -51,10 +52,10 @@ public class SWHUtils {
log.info("retryDelay is {}", clientParams.getRetryDelay()); log.info("retryDelay is {}", clientParams.getRetryDelay());
clientParams clientParams
.setRequestMethod( .setRequestMethod(
Optional Optional
.ofNullable(argumentParser.get(REQUEST_METHOD)) .ofNullable(argumentParser.get(REQUEST_METHOD))
.orElse(HttpClientParams._requestMethod)); .orElse(HttpClientParams._requestMethod));
log.info("requestMethod is {}", clientParams.getRequestMethod()); log.info("requestMethod is {}", clientParams.getRequestMethod());
return clientParams; return clientParams;
@ -63,16 +64,16 @@ public class SWHUtils {
public static BufferedReader getFileReader(FileSystem fs, Path inputPath) throws IOException { public static BufferedReader getFileReader(FileSystem fs, Path inputPath) throws IOException {
FSDataInputStream inputStream = fs.open(inputPath); FSDataInputStream inputStream = fs.open(inputPath);
return new BufferedReader( return new BufferedReader(
new InputStreamReader(inputStream, StandardCharsets.UTF_8)); new InputStreamReader(inputStream, StandardCharsets.UTF_8));
} }
public static SequenceFile.Writer getSequenceFileWriter(FileSystem fs, String outputPath) throws IOException { public static SequenceFile.Writer getSequenceFileWriter(FileSystem fs, String outputPath) throws IOException {
return SequenceFile return SequenceFile
.createWriter( .createWriter(
fs.getConf(), fs.getConf(),
SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer.file(new Path(outputPath)),
SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class)); SequenceFile.Writer.valueClass(Text.class));
} }
public static SequenceFile.Reader getSequenceFileReader(FileSystem fs, String inputPath) throws IOException { public static SequenceFile.Reader getSequenceFileReader(FileSystem fs, String inputPath) throws IOException {

View File

@ -11,12 +11,36 @@
"paramDescription": "the URL where to store last visits data", "paramDescription": "the URL where to store last visits data",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "arp",
"paramLongName": "archiveRequestsPath",
"paramDescription": "the URL where to store the responses of the archive requests",
"paramRequired": true
},
{
"paramName": "mnr",
"paramLongName": "maxNumberOfRetry",
"paramDescription": "the maximum number of admitted connection retries",
"paramRequired": false
},
{ {
"paramName": "rqd", "paramName": "rqd",
"paramLongName": "requestDelay", "paramLongName": "requestDelay",
"paramDescription": "the delay (ms) between requests", "paramDescription": "the delay (ms) between requests",
"paramRequired": false "paramRequired": false
}, },
{
"paramName": "rtd",
"paramLongName": "retryDelay",
"paramDescription": "the delay (sec) between retries",
"paramRequired": false
},
{
"paramName": "rm",
"paramLongName": "requestMethod",
"paramDescription": "the method of the requests to perform",
"paramRequired": false
},
{ {
"paramName": "atid", "paramName": "atid",
"paramLongName": "archiveThresholdInDays", "paramLongName": "archiveThresholdInDays",

View File

@ -34,5 +34,11 @@
"paramLongName": "retryDelay", "paramLongName": "retryDelay",
"paramDescription": "the delay (ms) between retries", "paramDescription": "the delay (ms) between retries",
"paramRequired": false "paramRequired": false
},
{
"paramName": "rm",
"paramLongName": "requestMethod",
"paramDescription": "the method of the requests to perform",
"paramRequired": false
} }
] ]

View File

@ -8,4 +8,8 @@ softwareCodeRepositoryURLs=${workingDir}/1_code_repo_urls.csv
lastVisitsPath=${workingDir}/2_last_visits.seq lastVisitsPath=${workingDir}/2_last_visits.seq
archiveRequestsPath=${workingDir}/3_archive_requests.seq archiveRequestsPath=${workingDir}/3_archive_requests.seq
maxNumberOfRetry=2
retryDelay=1
requestDelay=100
resume=collect-software-repository-urls resume=collect-software-repository-urls

View File

@ -8,7 +8,27 @@
</property> </property>
<property> <property>
<name>softwareCodeRepositoryURLs</name> <name>softwareCodeRepositoryURLs</name>
<description>The path in the HDSF to save the software repository URLs</description> <description>The path in the HDFS to save the software repository URLs</description>
</property>
<property>
<name>lastVisitsPath</name>
<description>The path in the HDFS to save the responses of the last visit requests</description>
</property>
<property>
<name>archiveRequestsPath</name>
<description>The path in the HDFS to save the responses of the archive requests</description>
</property>
<property>
<name>maxNumberOfRetry</name>
<description>Max number of retries for failed API calls</description>
</property>
<property>
<name>retryDelay</name>
<description>Retry delay for failed requests (in sec)</description>
</property>
<property>
<name>requestDelay</name>
<description>Delay between API requests (in ms)</description>
</property> </property>
<property> <property>
<name>resume</name> <name>resume</name>
@ -75,9 +95,9 @@
<arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg> <arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
<arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg> <arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
<arg>--maxNumberOfRetry</arg><arg>2</arg> <arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
<arg>--requestDelay</arg><arg>0</arg> <arg>--requestDelay</arg><arg>${requestDelay}</arg>
<arg>--retryDelay</arg><arg>1</arg> <arg>--retryDelay</arg><arg>${retryDelay}</arg>
<arg>--requestMethod</arg><arg>GET</arg> <arg>--requestMethod</arg><arg>GET</arg>
</java> </java>
@ -91,11 +111,12 @@
<arg>--namenode</arg><arg>${nameNode}</arg> <arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg> <arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
<arg>--archiveRequestsPath</arg><arg>${archiveRequestsPath}</arg>
<arg>--archiveThresholdInDays</arg><arg>365</arg> <arg>--archiveThresholdInDays</arg><arg>365</arg>
<arg>--maxNumberOfRetry</arg><arg>2</arg> <arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
<arg>--requestDelay</arg><arg>0</arg> <arg>--requestDelay</arg><arg>${requestDelay}</arg>
<arg>--retryDelay</arg><arg>1</arg> <arg>--retryDelay</arg><arg>${retryDelay}</arg>
<arg>--requestMethod</arg><arg>POST</arg> <arg>--requestMethod</arg><arg>POST</arg>
</java> </java>

View File

@ -1,35 +1,38 @@
package eu.dnetlib.dhp.swh;
import eu.dnetlib.dhp.swh.utils.SWHUtils; package eu.dnetlib.dhp.swh;
import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.Test;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.text.ParseException;
import java.util.Arrays; import java.util.Arrays;
import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.swh.utils.SWHUtils;
public class ArchiveRepositoryURLsTest { public class ArchiveRepositoryURLsTest {
@Test @Test
void testArchive() throws IOException { void testArchive() throws IOException, ParseException {
String inputPath = getClass() String inputPath = getClass()
.getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv") .getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv")
.getPath(); .getPath();
File file = new File(inputPath); File file = new File(inputPath);
FileReader fr = new FileReader(file); FileReader fr = new FileReader(file);
BufferedReader br = new BufferedReader(fr); //creates a buffering character input stream BufferedReader br = new BufferedReader(fr); // creates a buffering character input stream
String line; String line;
while((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
String[] tokens = line.split("\t"); String[] tokens = line.split("\t");
String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365); String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365);
System.out.println(tokens[0] + "\t" + response); System.out.println(tokens[0] + "\t" + response);
System.out.println(); System.out.println();
} }
fr.close(); fr.close();
} }
} }

View File

@ -1,17 +1,18 @@
package eu.dnetlib.dhp.swh; package eu.dnetlib.dhp.swh;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.swh.utils.SWHConnection; import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants; import eu.dnetlib.dhp.swh.utils.SWHConstants;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
//import org.apache.hadoop.hdfs.MiniDFSCluster; //import org.apache.hadoop.hdfs.MiniDFSCluster;
@ -24,7 +25,7 @@ public class SWHConnectionTest {
HttpClientParams clientParams = new HttpClientParams(); HttpClientParams clientParams = new HttpClientParams();
clientParams.setRequestMethod("GET"); clientParams.setRequestMethod("GET");
SWHConnection swhConnection = new SWHConnection(clientParams); SWHConnection swhConnection = new SWHConnection(clientParams);
String repoUrl = "https://github.com/stanford-futuredata/FAST"; String repoUrl = "https://github.com/stanford-futuredata/FAST";
URL url = new URL(String.format(SWHConstants.SWH_LATEST_VISIT_URL, repoUrl)); URL url = new URL(String.format(SWHConstants.SWH_LATEST_VISIT_URL, repoUrl));
@ -42,7 +43,7 @@ public class SWHConnectionTest {
HttpClientParams clientParams = new HttpClientParams(); HttpClientParams clientParams = new HttpClientParams();
clientParams.setRequestMethod("POST"); clientParams.setRequestMethod("POST");
SWHConnection swhConnection = new SWHConnection(clientParams); SWHConnection swhConnection = new SWHConnection(clientParams);
String repoUrl = "https://github.com/stanford-futuredata/FAST"; String repoUrl = "https://github.com/stanford-futuredata/FAST";
URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, SWHConstants.DEFAULT_VISIT_TYPE, repoUrl)); URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, SWHConstants.DEFAULT_VISIT_TYPE, repoUrl));

View File

@ -1,3 +1,4 @@
https://bitbucket.org/samskillman/yt-stokes {"origin":"https://bitbucket.org/samskillman/yt-stokes","visit":43,"date":"2021-09-13T21:59:27.125171+00:00","status":"failed","snapshot":null,"type":"hg","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://bitbucket.org/samskillman/yt-stokes/get/","snapshot_url":null}
https://github.com/bioinsilico/BIPSPI {"origin":"https://github.com/bioinsilico/BIPSPI","visit":1,"date":"2020-03-18T14:50:21.541822+00:00","status":"full","snapshot":"c6c69d2cd73ce89811448da5f031611df6f63bdb","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/bioinsilico/BIPSPI/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/c6c69d2cd73ce89811448da5f031611df6f63bdb/"} https://github.com/bioinsilico/BIPSPI {"origin":"https://github.com/bioinsilico/BIPSPI","visit":1,"date":"2020-03-18T14:50:21.541822+00:00","status":"full","snapshot":"c6c69d2cd73ce89811448da5f031611df6f63bdb","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/bioinsilico/BIPSPI/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/c6c69d2cd73ce89811448da5f031611df6f63bdb/"}
https://github.com/mloop/kdiff-type1-error-rate/blob/master/analysis/simulation.R {} https://github.com/mloop/kdiff-type1-error-rate/blob/master/analysis/simulation.R {}
https://github.com/schwanbeck/YSMR {"origin":"https://github.com/schwanbeck/YSMR","visit":6,"date":"2023-08-02T15:25:02.650676+00:00","status":"full","snapshot":"a9d1c5f0bca2def198b89f65bc9f7da3be8439ed","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/schwanbeck/YSMR/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/a9d1c5f0bca2def198b89f65bc9f7da3be8439ed/"} https://github.com/schwanbeck/YSMR {"origin":"https://github.com/schwanbeck/YSMR","visit":6,"date":"2023-08-02T15:25:02.650676+00:00","status":"full","snapshot":"a9d1c5f0bca2def198b89f65bc9f7da3be8439ed","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/schwanbeck/YSMR/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/a9d1c5f0bca2def198b89f65bc9f7da3be8439ed/"}

Can't render this file because it contains an unexpected character in line 1 and column 40.