forked from D-Net/dnet-hadoop

Add step for archiving repoUrls to SWH

parent ed9c81a0b7, commit ab0d70691c
@@ -53,9 +53,9 @@ public class HttpClientParams {
 	 */
 	private String requestMethod;
 
 
 	public HttpClientParams() {
-		this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(), _requestMethod);
+		this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(),
+			_requestMethod);
 	}
 
 	public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
@@ -1,14 +1,16 @@
 
 package eu.dnetlib.dhp.swh;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.CollectorException;
-import eu.dnetlib.dhp.common.collection.HttpClientParams;
-import eu.dnetlib.dhp.swh.models.LastVisitData;
-import eu.dnetlib.dhp.swh.utils.SWHConnection;
-import eu.dnetlib.dhp.swh.utils.SWHConstants;
-import eu.dnetlib.dhp.swh.utils.SWHUtils;
+import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
+import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.fs.FileSystem;
@@ -17,14 +19,17 @@ import org.apache.hadoop.io.Text;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.net.URL;
-import java.util.Date;
-import java.util.Optional;
-import java.util.concurrent.TimeUnit;
+import com.fasterxml.jackson.databind.ObjectMapper;
 
-import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
-import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+import eu.dnetlib.dhp.common.collection.HttpClientParams;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
+import eu.dnetlib.dhp.swh.models.LastVisitData;
+import eu.dnetlib.dhp.swh.utils.SWHConnection;
+import eu.dnetlib.dhp.swh.utils.SWHConstants;
+import eu.dnetlib.dhp.swh.utils.SWHUtils;
 
 /**
  * Sends archive requests to the SWH API for those software repository URLs that are missing from them
@@ -69,7 +74,8 @@ public class ArchiveRepositoryURLs {
 
 	}
 
-	private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays) throws IOException {
+	private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays)
+		throws IOException {
 
 		SequenceFile.Reader fr = SWHUtils.getSequenceFileReader(fs, inputPath);
 		SequenceFile.Writer fw = SWHUtils.getSequenceFileWriter(fs, outputPath);
@@ -81,7 +87,13 @@ public class ArchiveRepositoryURLs {
 		// Read key-value pairs from the SequenceFile and handle appropriately
 		while (fr.next(repoUrl, lastVisitData)) {
 
-			String response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
+			String response = null;
+			try {
+				response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
+			} catch (java.text.ParseException e) {
+				log.error("Could not handle record with repo Url: {}", repoUrl.toString());
+				throw new RuntimeException(e);
+			}
 
 			// response is equal to null when no need for request
 			if (response != null) {
@@ -95,43 +107,68 @@ public class ArchiveRepositoryURLs {
 		fr.close();
 	}
 
-	public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays) throws IOException {
-		System.out.println("Key: " + repoUrl + ", Value: " + lastVisitData);
+	public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays)
+		throws IOException, java.text.ParseException {
+
+		log.info("{ Key: {}, Value: {} }", repoUrl, lastVisitData);
 
 		LastVisitData lastVisit = OBJECT_MAPPER.readValue(lastVisitData, LastVisitData.class);
 
-		// perform an archive request when no repoUrl was not found in previous step
+		// a previous attempt for archival has been made, and repository URL was not found
+		// avoid performing the same archive request again
+		if (lastVisit.getType() != null &&
+			lastVisit.getType().equals(SWHConstants.VISIT_STATUS_NOT_FOUND)) {
+
+			log.info("Avoid request -- previous archive request returned NOT_FOUND");
+			return null;
+		}
+
+		// if we have last visit data
 		if (lastVisit.getSnapshot() != null) {
 
-			// OR last visit was before (now() - archiveThresholdInDays)
-			long diffInMillies = Math.abs((new Date()).getTime() - lastVisit.getDate().getTime());
-			long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
+			String cleanDate = GraphCleaningFunctions.cleanDate(lastVisit.getDate());
+
+			// and the last visit date can be parsed
+			if (cleanDate != null) {
+
+				SimpleDateFormat formatter = new SimpleDateFormat(ModelSupport.DATE_FORMAT);
+				Date lastVisitDate = formatter.parse(cleanDate);
+
+				// OR last visit time < (now() - archiveThresholdInDays)
+				long diffInMillies = Math.abs((new Date()).getTime() - lastVisitDate.getTime());
+				long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
+				log.info("Date diff from now (in days): {}", diffInDays);
 
-			if (archiveThresholdInDays >= diffInDays) {
-				return null;
+				// do not perform a request, if the last visit date is no older than $archiveThresholdInDays
+				if (archiveThresholdInDays >= diffInDays) {
+					log.info("Avoid request -- no older than {} days", archiveThresholdInDays);
+					return null;
+				}
 			}
 		}
 
-		// if last visit data are available, re-use version control type, else use the default one (i.e., git)
+		// ELSE perform an archive request
+		log.info("Perform archive request for: {}", repoUrl);
+
+		// if last visit data are available, re-use version control type,
+		// else use the default one (i.e., git)
 		String visitType = Optional
 			.ofNullable(lastVisit.getType())
 			.orElse(SWHConstants.DEFAULT_VISIT_TYPE);
 
 		URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, visitType, repoUrl.trim()));
-		System.out.println(url.toString());
+
+		log.info("Sending archive request: {}", url);
 
 		String response;
 		try {
 			response = swhConnection.call(url.toString());
 		} catch (CollectorException e) {
-			log.info("Error in request: {}", url);
+			log.error("Error in request: {}", url);
 			response = "{}";
 		}
 
 		return response;
 	}
 
 }
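A self-contained sketch of the threshold check introduced in handleRecord above. The concrete date pattern is an assumption here (the real code takes it from ModelSupport.DATE_FORMAT), and the sample date comes from the lastVisitDataToArchive.csv fixture at the end of this commit:

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.concurrent.TimeUnit;

public class ArchiveThresholdSketch {

	// Assumption: an ISO-8601-like pattern, standing in for ModelSupport.DATE_FORMAT
	private static final String DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ssZ";

	public static void main(String[] args) throws ParseException {
		// A cleaned last-visit date, e.g. the output of GraphCleaningFunctions.cleanDate(...)
		String cleanDate = "2021-09-13T21:59:27+0000";
		Date lastVisitDate = new SimpleDateFormat(DATE_FORMAT).parse(cleanDate);

		// Same computation as in handleRecord: age of the last visit, in days
		long diffInMillies = Math.abs(new Date().getTime() - lastVisitDate.getTime());
		long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);

		int archiveThresholdInDays = 365;
		// A visit younger than the threshold means no new archive request is needed
		System.out.printf("diffInDays=%d, skipRequest=%b%n", diffInDays, archiveThresholdInDays >= diffInDays);
	}
}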
@@ -1,12 +1,15 @@
 
 package eu.dnetlib.dhp.swh;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.CollectorException;
-import eu.dnetlib.dhp.common.collection.HttpClientParams;
-import eu.dnetlib.dhp.swh.utils.SWHConnection;
-import eu.dnetlib.dhp.swh.utils.SWHConstants;
-import eu.dnetlib.dhp.swh.utils.SWHUtils;
+import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -18,14 +21,12 @@ import org.apache.hadoop.io.Text;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.nio.charset.StandardCharsets;
-
-import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+import eu.dnetlib.dhp.common.collection.HttpClientParams;
+import eu.dnetlib.dhp.swh.utils.SWHConnection;
+import eu.dnetlib.dhp.swh.utils.SWHConstants;
+import eu.dnetlib.dhp.swh.utils.SWHUtils;
 
 /**
  * Given a file with software repository URLs, this class
@@ -107,7 +108,7 @@ public class CollectLastVisitRepositoryData {
 		try {
 			response = swhConnection.call(url.toString());
 		} catch (CollectorException e) {
-			log.info("Error in request: {}", url);
+			log.error("Error in request: {}", url);
 			response = "{}";
 		}
 
@@ -1,8 +1,11 @@
 
 package eu.dnetlib.dhp.swh;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Result;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.io.Serializable;
+import java.util.Optional;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
@@ -12,10 +15,8 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.Serializable;
-import java.util.Optional;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Result;
 
 /**
  * Collects unique software repository URLs in the Graph using Hive
@@ -69,7 +70,7 @@ public class CollectSoftwareRepositoryURLs implements Serializable {
 			"WHERE coderepositoryurl.value IS NOT NULL " +
 			"AND datainfo.deletedbyinference = FALSE " +
 			"AND datainfo.invisible = FALSE " +
-			"LIMIT 1000"; // TODO remove
+			"LIMIT 1000";
 		String query = String.format(queryTemplate, hiveDbName);
 
 		log.info("Hive query to fetch software code URLs: {}", query);
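The hunk above touches only the tail of a larger Hive query template that is instantiated per database with String.format. A sketch of the assembly; the SELECT/FROM head and the database name are assumptions for illustration and may differ from the actual class:

public class RepoUrlQuerySketch {
	public static void main(String[] args) {
		// Head of the template assumed from context
		String queryTemplate = "SELECT DISTINCT coderepositoryurl.value AS repoUrl " +
			"FROM %s.software " +
			"WHERE coderepositoryurl.value IS NOT NULL " +
			"AND datainfo.deletedbyinference = FALSE " +
			"AND datainfo.invisible = FALSE " +
			"LIMIT 1000";

		String hiveDbName = "openaire_graph_db"; // hypothetical database name
		String query = String.format(queryTemplate, hiveDbName);
		System.out.println(query);
	}
}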
@@ -1,21 +1,23 @@
 
 package eu.dnetlib.dhp.swh.models;
 
+import java.util.Date;
+
+import com.cloudera.com.fasterxml.jackson.annotation.JsonFormat;
 import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 
-import java.util.Date;
-
 @JsonIgnoreProperties(ignoreUnknown = true)
 public class LastVisitData {
 
 	private String type;
-	private Date date;
+	private String date;
 
 	@JsonProperty("snapshot")
 	private String snapshotId;
 
+	private String status;
+
 	public String getType() {
 		return type;
 	}
@@ -24,11 +26,11 @@ public class LastVisitData {
 		this.type = type;
 	}
 
-	public Date getDate() {
+	public String getDate() {
 		return date;
 	}
 
-	public void setDate(Date date) {
+	public void setDate(String date) {
 		this.date = date;
 	}
 
@@ -39,4 +41,12 @@ public class LastVisitData {
 	public void setSnapshot(String snapshotId) {
 		this.snapshotId = snapshotId;
 	}
+
+	public String getStatus() {
+		return status;
+	}
+
+	public void setStatus(String status) {
+		this.status = status;
+	}
 }
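The reworked model maps the SWH visit payload as found in the lastVisitDataToArchive.csv fixture at the end of this commit. A minimal deserialization sketch with the plain Jackson ObjectMapper, as the main code uses:

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.swh.models.LastVisitData;

public class LastVisitDataSketch {

	public static void main(String[] args) throws Exception {
		// One record taken from the test fixture in this commit
		String json = "{\"origin\":\"https://github.com/bioinsilico/BIPSPI\",\"visit\":1,"
			+ "\"date\":\"2020-03-18T14:50:21.541822+00:00\",\"status\":\"full\","
			+ "\"snapshot\":\"c6c69d2cd73ce89811448da5f031611df6f63bdb\",\"type\":\"git\"}";

		// Unknown properties (origin, visit, ...) are skipped via @JsonIgnoreProperties;
		// "snapshot" binds through the setSnapshot accessor
		LastVisitData lastVisit = new ObjectMapper().readValue(json, LastVisitData.class);

		System.out.println(lastVisit.getType()); // git
		System.out.println(lastVisit.getDate()); // 2020-03-18T14:50:21.541822+00:00
		System.out.println(lastVisit.getStatus()); // full
	}
}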
@@ -6,8 +6,10 @@ public class SWHConstants {
 
 	public static final String SWH_ARCHIVE_URL = "https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/";
 
-	public static final String ACCESS_TOKEN = "";
+	public static final String ACCESS_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtODQ2Ny05MzAyZjk3MTFkOGEifQ.eyJpYXQiOjE2OTQ2MzYwMjAsImp0aSI6IjkwZjdkNTNjLTQ5YTktNGFiMy1hY2E0LTcwMTViMjEyZTNjNiIsImlzcyI6Imh0dHBzOi8vYXV0aC5zb2Z0d2FyZWhlcml0YWdlLm9yZy9hdXRoL3JlYWxtcy9Tb2Z0d2FyZUhlcml0YWdlIiwiYXVkIjoiaHR0cHM6Ly9hdXRoLnNvZnR3YXJlaGVyaXRhZ2Uub3JnL2F1dGgvcmVhbG1zL1NvZnR3YXJlSGVyaXRhZ2UiLCJzdWIiOiIzMTY5OWZkNC0xNmE0LTQxOWItYTdhMi00NjI5MDY4ZjI3OWEiLCJ0eXAiOiJPZmZsaW5lIiwiYXpwIjoic3doLXdlYiIsInNlc3Npb25fc3RhdGUiOiIzMjYzMzEwMS00ZDRkLTQwMjItODU2NC1iMzNlMTJiNTE3ZDkiLCJzY29wZSI6Im9wZW5pZCBvZmZsaW5lX2FjY2VzcyBwcm9maWxlIGVtYWlsIn0.XHj1VIZu1dZ4Ej32-oU84mFmaox9cLNjXosNxwZM0Xs";
 
 	public static final String DEFAULT_VISIT_TYPE = "git";
 
+	public static final String VISIT_STATUS_NOT_FOUND = "not_found";
+
 }
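As used by ArchiveRepositoryURLs above, the two %s placeholders in SWH_ARCHIVE_URL take the visit type and the repository URL:

public class SwhArchiveUrlSketch {
	public static void main(String[] args) {
		// Same String.format call as in ArchiveRepositoryURLs; repo URL from the test fixture
		String url = String
			.format(
				"https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/",
				"git", "https://github.com/bioinsilico/BIPSPI");
		System.out.println(url);
		// https://archive.softwareheritage.org/api/1/origin/save/git/url/https://github.com/bioinsilico/BIPSPI/
	}
}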
@@ -1,8 +1,14 @@
 
 package eu.dnetlib.dhp.swh.utils;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.HttpClientParams;
+import static eu.dnetlib.dhp.common.Constants.*;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Optional;
 
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -11,13 +17,8 @@ import org.apache.hadoop.io.Text;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.Optional;
-
-import static eu.dnetlib.dhp.common.Constants.*;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.collection.HttpClientParams;
 
 public class SWHUtils {
 
@@ -11,12 +11,36 @@
     "paramDescription": "the URL where to store last visits data",
     "paramRequired": true
   },
+  {
+    "paramName": "arp",
+    "paramLongName": "archiveRequestsPath",
+    "paramDescription": "the URL where to store the responses of the archive requests",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mnr",
+    "paramLongName": "maxNumberOfRetry",
+    "paramDescription": "the maximum number of admitted connection retries",
+    "paramRequired": false
+  },
   {
     "paramName": "rqd",
     "paramLongName": "requestDelay",
     "paramDescription": "the delay (ms) between requests",
     "paramRequired": false
   },
+  {
+    "paramName": "rtd",
+    "paramLongName": "retryDelay",
+    "paramDescription": "the delay (ms) between retries",
+    "paramRequired": false
+  },
+  {
+    "paramName": "rm",
+    "paramLongName": "requestMethod",
+    "paramDescription": "the method of the requests to perform",
+    "paramRequired": false
+  },
   {
     "paramName": "atid",
     "paramLongName": "archiveThresholdInDays",
@@ -34,5 +34,11 @@
     "paramLongName": "retryDelay",
     "paramDescription": "the delay (ms) between retries",
     "paramRequired": false
+  },
+  {
+    "paramName": "rm",
+    "paramLongName": "requestMethod",
+    "paramDescription": "the method of the requests to perform",
+    "paramRequired": false
   }
 ]
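These parameter files are consumed through ArgumentApplicationParser. A sketch of the usual dnet-hadoop pattern; the resource path here is hypothetical and the actual file name in this module may differ:

import java.util.Optional;

import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class ParamsSketch {
	public static void main(String[] args) throws Exception {
		ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					ParamsSketch.class
						.getResourceAsStream("/eu/dnetlib/dhp/swh/input_parameters.json"))); // hypothetical path
		parser.parseArgument(args);

		// Optional params ("paramRequired": false) fall back to defaults when absent
		int maxNumberOfRetry = Optional
			.ofNullable(parser.get("maxNumberOfRetry"))
			.map(Integer::parseInt)
			.orElse(2);

		System.out.println("maxNumberOfRetry=" + maxNumberOfRetry);
	}
}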
@@ -8,4 +8,8 @@ softwareCodeRepositoryURLs=${workingDir}/1_code_repo_urls.csv
 lastVisitsPath=${workingDir}/2_last_visits.seq
 archiveRequestsPath=${workingDir}/3_archive_requests.seq
 
+maxNumberOfRetry=2
+retryDelay=1
+requestDelay=100
+
 resume=collect-software-repository-urls
@@ -8,7 +8,27 @@
         </property>
         <property>
             <name>softwareCodeRepositoryURLs</name>
-            <description>The path in the HDSF to save the software repository URLs</description>
+            <description>The path in the HDFS to save the software repository URLs</description>
+        </property>
+        <property>
+            <name>lastVisitsPath</name>
+            <description>The path in the HDFS to save the responses of the last visit requests</description>
+        </property>
+        <property>
+            <name>archiveRequestsPath</name>
+            <description>The path in the HDFS to save the responses of the archive requests</description>
+        </property>
+        <property>
+            <name>maxNumberOfRetry</name>
+            <description>Max number of retries for failed API calls</description>
+        </property>
+        <property>
+            <name>retryDelay</name>
+            <description>Retry delay for failed requests (in sec)</description>
+        </property>
+        <property>
+            <name>requestDelay</name>
+            <description>Delay between API requests (in ms)</description>
         </property>
         <property>
             <name>resume</name>
@@ -75,9 +95,9 @@
             <arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
             <arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
 
-            <arg>--maxNumberOfRetry</arg><arg>2</arg>
-            <arg>--requestDelay</arg><arg>0</arg>
-            <arg>--retryDelay</arg><arg>1</arg>
+            <arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
+            <arg>--requestDelay</arg><arg>${requestDelay}</arg>
+            <arg>--retryDelay</arg><arg>${retryDelay}</arg>
             <arg>--requestMethod</arg><arg>GET</arg>
 
         </java>
@@ -91,11 +111,12 @@
 
             <arg>--namenode</arg><arg>${nameNode}</arg>
             <arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
+            <arg>--archiveRequestsPath</arg><arg>${archiveRequestsPath}</arg>
             <arg>--archiveThresholdInDays</arg><arg>365</arg>
 
-            <arg>--maxNumberOfRetry</arg><arg>2</arg>
-            <arg>--requestDelay</arg><arg>0</arg>
-            <arg>--retryDelay</arg><arg>1</arg>
+            <arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
+            <arg>--requestDelay</arg><arg>${requestDelay}</arg>
+            <arg>--retryDelay</arg><arg>${retryDelay}</arg>
             <arg>--requestMethod</arg><arg>POST</arg>
 
         </java>
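Note that the two steps share the endpoint shape but differ in verb: the last-visit check uses GET while the archive trigger uses POST, which matches the public SWH save-origin API. A plain-java.net sketch independent of SWHConnection; the Authorization header value is a placeholder (the module keeps a token in SWHConstants.ACCESS_TOKEN):

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;

public class SwhSaveRequestSketch {

	public static void main(String[] args) throws IOException {
		// POST triggers archival, as in the archive-repository-urls step;
		// the collect step issues GET against the visit endpoint instead
		URL url = new URL(
			"https://archive.softwareheritage.org/api/1/origin/save/git/url/"
				+ "https://github.com/bioinsilico/BIPSPI/");

		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
		conn.setRequestMethod("POST");
		conn.setRequestProperty("Authorization", "Bearer <ACCESS_TOKEN>"); // token elided

		System.out.println("HTTP " + conn.getResponseCode());
	}
}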
@@ -1,29 +1,32 @@
-package eu.dnetlib.dhp.swh;
 
-import eu.dnetlib.dhp.swh.utils.SWHUtils;
-import org.apache.hadoop.fs.FileSystem;
-import org.junit.jupiter.api.Test;
+package eu.dnetlib.dhp.swh;
 
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
+import java.text.ParseException;
 import java.util.Arrays;
 
+import org.apache.hadoop.fs.FileSystem;
+import org.junit.jupiter.api.Test;
+
+import eu.dnetlib.dhp.swh.utils.SWHUtils;
+
 public class ArchiveRepositoryURLsTest {
 
 	@Test
-	void testArchive() throws IOException {
+	void testArchive() throws IOException, ParseException {
 		String inputPath = getClass()
 			.getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv")
 			.getPath();
 
 		File file = new File(inputPath);
 		FileReader fr = new FileReader(file);
-		BufferedReader br = new BufferedReader(fr); //creates a buffering character input stream
+		BufferedReader br = new BufferedReader(fr); // creates a buffering character input stream
 
 		String line;
-		while((line = br.readLine()) != null) {
+		while ((line = br.readLine()) != null) {
 			String[] tokens = line.split("\t");
 
 			String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365);
@@ -1,17 +1,18 @@
 
 package eu.dnetlib.dhp.swh;
 
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.common.collection.HttpClientParams;
 import eu.dnetlib.dhp.swh.utils.SWHConnection;
 import eu.dnetlib.dhp.swh.utils.SWHConstants;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URL;
 
 //import org.apache.hadoop.hdfs.MiniDFSCluster;
 
@@ -1,3 +1,4 @@
+https://bitbucket.org/samskillman/yt-stokes	{"origin":"https://bitbucket.org/samskillman/yt-stokes","visit":43,"date":"2021-09-13T21:59:27.125171+00:00","status":"failed","snapshot":null,"type":"hg","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://bitbucket.org/samskillman/yt-stokes/get/","snapshot_url":null}
 https://github.com/bioinsilico/BIPSPI	{"origin":"https://github.com/bioinsilico/BIPSPI","visit":1,"date":"2020-03-18T14:50:21.541822+00:00","status":"full","snapshot":"c6c69d2cd73ce89811448da5f031611df6f63bdb","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/bioinsilico/BIPSPI/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/c6c69d2cd73ce89811448da5f031611df6f63bdb/"}
 https://github.com/mloop/kdiff-type1-error-rate/blob/master/analysis/simulation.R	{}
 https://github.com/schwanbeck/YSMR	{"origin":"https://github.com/schwanbeck/YSMR","visit":6,"date":"2023-08-02T15:25:02.650676+00:00","status":"full","snapshot":"a9d1c5f0bca2def198b89f65bc9f7da3be8439ed","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/schwanbeck/YSMR/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/a9d1c5f0bca2def198b89f65bc9f7da3be8439ed/"}