forked from antonis.lempesis/dnet-hadoop
Add step for archiving repoUrls to SWH
This commit is contained in:
parent
ed9c81a0b7
commit
ab0d70691c
|
@ -53,9 +53,9 @@ public class HttpClientParams {
|
|||
*/
|
||||
private String requestMethod;
|
||||
|
||||
|
||||
public HttpClientParams() {
|
||||
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(), _requestMethod);
|
||||
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(),
|
||||
_requestMethod);
|
||||
}
|
||||
|
||||
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
|
||||
|
|
|
@ -1,14 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.swh;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.swh.models.LastVisitData;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConnection;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConstants;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
||||
import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
@ -17,14 +19,17 @@ import org.apache.hadoop.io.Text;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.util.Date;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import eu.dnetlib.dhp.swh.models.LastVisitData;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConnection;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConstants;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
||||
|
||||
/**
|
||||
* Sends archive requests to the SWH API for those software repository URLs that are missing from the SWH archive
|
||||
|
@ -69,7 +74,8 @@ public class ArchiveRepositoryURLs {
|
|||
|
||||
}
|
||||
|
||||
private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays) throws IOException {
|
||||
private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays)
|
||||
throws IOException {
|
||||
|
||||
SequenceFile.Reader fr = SWHUtils.getSequenceFileReader(fs, inputPath);
|
||||
SequenceFile.Writer fw = SWHUtils.getSequenceFileWriter(fs, outputPath);
|
||||
|
@ -81,7 +87,13 @@ public class ArchiveRepositoryURLs {
|
|||
// Read key-value pairs from the SequenceFile and handle appropriately
|
||||
while (fr.next(repoUrl, lastVisitData)) {
|
||||
|
||||
String response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
|
||||
String response = null;
|
||||
try {
|
||||
response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
|
||||
} catch (java.text.ParseException e) {
|
||||
log.error("Could not handle record with repo Url: {}", repoUrl.toString());
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
// response is equal to null when no need for request
|
||||
if (response != null) {
|
||||
|
@ -95,43 +107,68 @@ public class ArchiveRepositoryURLs {
|
|||
fr.close();
|
||||
}
|
||||
|
||||
public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays) throws IOException {
|
||||
System.out.println("Key: " + repoUrl + ", Value: " + lastVisitData);
|
||||
public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays)
|
||||
throws IOException, java.text.ParseException {
|
||||
|
||||
log.info("{ Key: {}, Value: {} }", repoUrl, lastVisitData);
|
||||
|
||||
LastVisitData lastVisit = OBJECT_MAPPER.readValue(lastVisitData, LastVisitData.class);
|
||||
|
||||
// perform an archive request when no repoUrl was not found in previous step
|
||||
// a previous attempt for archival has been made, and repository URL was not found
|
||||
// avoid performing the same archive request again
|
||||
if (lastVisit.getType() != null &&
|
||||
lastVisit.getType().equals(SWHConstants.VISIT_STATUS_NOT_FOUND)) {
|
||||
|
||||
log.info("Avoid request -- previous archive request returned NOT_FOUND");
|
||||
return null;
|
||||
}
|
||||
|
||||
// if we have last visit data
|
||||
if (lastVisit.getSnapshot() != null) {
|
||||
|
||||
// OR last visit was before (now() - archiveThresholdInDays)
|
||||
long diffInMillies = Math.abs((new Date()).getTime() - lastVisit.getDate().getTime());
|
||||
long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
|
||||
String cleanDate = GraphCleaningFunctions.cleanDate(lastVisit.getDate());
|
||||
|
||||
if (archiveThresholdInDays >= diffInDays) {
|
||||
return null;
|
||||
// and the last visit date can be parsed
|
||||
if (cleanDate != null) {
|
||||
|
||||
SimpleDateFormat formatter = new SimpleDateFormat(ModelSupport.DATE_FORMAT);
|
||||
Date lastVisitDate = formatter.parse(cleanDate);
|
||||
|
||||
// OR last visit time < (now() - archiveThresholdInDays)
|
||||
long diffInMillies = Math.abs((new Date()).getTime() - lastVisitDate.getTime());
|
||||
long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
|
||||
log.info("Date diff from now (in days): {}", diffInDays);
|
||||
|
||||
// do not perform a request, if the last visit date is no older than $archiveThresholdInDays
|
||||
if (archiveThresholdInDays >= diffInDays) {
|
||||
log.info("Avoid request -- no older than {} days", archiveThresholdInDays);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if last visit data are available, re-use version control type, else use the default one (i.e., git)
|
||||
// ELSE perform an archive request
|
||||
log.info("Perform archive request for: {}", repoUrl);
|
||||
|
||||
// if last visit data are available, re-use version control type,
|
||||
// else use the default one (i.e., git)
|
||||
String visitType = Optional
|
||||
.ofNullable(lastVisit.getType())
|
||||
.orElse(SWHConstants.DEFAULT_VISIT_TYPE);
|
||||
|
||||
URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, visitType, repoUrl.trim()));
|
||||
System.out.println(url.toString());
|
||||
|
||||
log.info("Sending archive request: {}", url);
|
||||
|
||||
String response;
|
||||
try {
|
||||
response = swhConnection.call(url.toString());
|
||||
} catch (CollectorException e) {
|
||||
log.info("Error in request: {}", url);
|
||||
log.error("Error in request: {}", url);
|
||||
response = "{}";
|
||||
}
|
||||
|
||||
return response;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
|
||||
package eu.dnetlib.dhp.swh;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConnection;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConstants;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
|
@ -18,14 +21,12 @@ import org.apache.hadoop.io.Text;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConnection;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConstants;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
||||
|
||||
/**
|
||||
* Given a file with software repository URLs, this class
|
||||
|
@ -107,7 +108,7 @@ public class CollectLastVisitRepositoryData {
|
|||
try {
|
||||
response = swhConnection.call(url.toString());
|
||||
} catch (CollectorException e) {
|
||||
log.info("Error in request: {}", url);
|
||||
log.error("Error in request: {}", url);
|
||||
response = "{}";
|
||||
}
|
||||
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.swh;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -12,10 +15,8 @@ import org.apache.spark.sql.SparkSession;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
/**
|
||||
* Collects unique software repository URLs in the Graph using Hive
|
||||
|
@ -69,7 +70,7 @@ public class CollectSoftwareRepositoryURLs implements Serializable {
|
|||
"WHERE coderepositoryurl.value IS NOT NULL " +
|
||||
"AND datainfo.deletedbyinference = FALSE " +
|
||||
"AND datainfo.invisible = FALSE " +
|
||||
"LIMIT 1000"; // TODO remove
|
||||
"LIMIT 1000";
|
||||
String query = String.format(queryTemplate, hiveDbName);
|
||||
|
||||
log.info("Hive query to fetch software code URLs: {}", query);
|
||||
|
|
|
@ -1,21 +1,23 @@
|
|||
|
||||
package eu.dnetlib.dhp.swh.models;
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
import com.cloudera.com.fasterxml.jackson.annotation.JsonFormat;
|
||||
import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
@JsonIgnoreProperties(ignoreUnknown = true)
|
||||
public class LastVisitData {
|
||||
|
||||
private String type;
|
||||
|
||||
private Date date;
|
||||
private String date;
|
||||
|
||||
@JsonProperty("snapshot")
|
||||
private String snapshotId;
|
||||
|
||||
private String status;
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
@ -24,11 +26,11 @@ public class LastVisitData {
|
|||
this.type = type;
|
||||
}
|
||||
|
||||
public Date getDate() {
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(Date date) {
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
|
@ -39,4 +41,12 @@ public class LastVisitData {
|
|||
public void setSnapshot(String snapshotId) {
|
||||
this.snapshotId = snapshotId;
|
||||
}
|
||||
|
||||
public String getStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
public void setStatus(String status) {
|
||||
this.status = status;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,8 +6,10 @@ public class SWHConstants {
|
|||
|
||||
public static final String SWH_ARCHIVE_URL = "https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/";
|
||||
|
||||
public static final String ACCESS_TOKEN = "";
|
||||
public static final String ACCESS_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtODQ2Ny05MzAyZjk3MTFkOGEifQ.eyJpYXQiOjE2OTQ2MzYwMjAsImp0aSI6IjkwZjdkNTNjLTQ5YTktNGFiMy1hY2E0LTcwMTViMjEyZTNjNiIsImlzcyI6Imh0dHBzOi8vYXV0aC5zb2Z0d2FyZWhlcml0YWdlLm9yZy9hdXRoL3JlYWxtcy9Tb2Z0d2FyZUhlcml0YWdlIiwiYXVkIjoiaHR0cHM6Ly9hdXRoLnNvZnR3YXJlaGVyaXRhZ2Uub3JnL2F1dGgvcmVhbG1zL1NvZnR3YXJlSGVyaXRhZ2UiLCJzdWIiOiIzMTY5OWZkNC0xNmE0LTQxOWItYTdhMi00NjI5MDY4ZjI3OWEiLCJ0eXAiOiJPZmZsaW5lIiwiYXpwIjoic3doLXdlYiIsInNlc3Npb25fc3RhdGUiOiIzMjYzMzEwMS00ZDRkLTQwMjItODU2NC1iMzNlMTJiNTE3ZDkiLCJzY29wZSI6Im9wZW5pZCBvZmZsaW5lX2FjY2VzcyBwcm9maWxlIGVtYWlsIn0.XHj1VIZu1dZ4Ej32-oU84mFmaox9cLNjXosNxwZM0Xs";
|
||||
|
||||
public static final String DEFAULT_VISIT_TYPE = "git";
|
||||
|
||||
public static final String VISIT_STATUS_NOT_FOUND = "not_found";
|
||||
|
||||
}
|
||||
|
|
|
@ -1,8 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.swh.utils;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import static eu.dnetlib.dhp.common.Constants.*;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
@ -11,13 +17,8 @@ import org.apache.hadoop.io.Text;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Optional;
|
||||
|
||||
import static eu.dnetlib.dhp.common.Constants.*;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
||||
public class SWHUtils {
|
||||
|
||||
|
@ -51,10 +52,10 @@ public class SWHUtils {
|
|||
log.info("retryDelay is {}", clientParams.getRetryDelay());
|
||||
|
||||
clientParams
|
||||
.setRequestMethod(
|
||||
Optional
|
||||
.ofNullable(argumentParser.get(REQUEST_METHOD))
|
||||
.orElse(HttpClientParams._requestMethod));
|
||||
.setRequestMethod(
|
||||
Optional
|
||||
.ofNullable(argumentParser.get(REQUEST_METHOD))
|
||||
.orElse(HttpClientParams._requestMethod));
|
||||
log.info("requestMethod is {}", clientParams.getRequestMethod());
|
||||
|
||||
return clientParams;
|
||||
|
@ -63,16 +64,16 @@ public class SWHUtils {
|
|||
public static BufferedReader getFileReader(FileSystem fs, Path inputPath) throws IOException {
|
||||
FSDataInputStream inputStream = fs.open(inputPath);
|
||||
return new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8));
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
public static SequenceFile.Writer getSequenceFileWriter(FileSystem fs, String outputPath) throws IOException {
|
||||
return SequenceFile
|
||||
.createWriter(
|
||||
fs.getConf(),
|
||||
SequenceFile.Writer.file(new Path(outputPath)),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class));
|
||||
.createWriter(
|
||||
fs.getConf(),
|
||||
SequenceFile.Writer.file(new Path(outputPath)),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class));
|
||||
}
|
||||
|
||||
public static SequenceFile.Reader getSequenceFileReader(FileSystem fs, String inputPath) throws IOException {
|
||||
|
|
|
@ -11,12 +11,36 @@
|
|||
"paramDescription": "the URL where to store last visits data",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "arp",
|
||||
"paramLongName": "archiveRequestsPath",
|
||||
"paramDescription": "the URL where to store the responses of the archive requests",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mnr",
|
||||
"paramLongName": "maxNumberOfRetry",
|
||||
"paramDescription": "the maximum number of admitted connection retries",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rqd",
|
||||
"paramLongName": "requestDelay",
|
||||
"paramDescription": "the delay (ms) between requests",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rtd",
|
||||
"paramLongName": "retryDelay",
|
||||
"paramDescription": "the delay (ms) between retries",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rm",
|
||||
"paramLongName": "requestMethod",
|
||||
"paramDescription": "the method of the requests to perform",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "atid",
|
||||
"paramLongName": "archiveThresholdInDays",
|
||||
|
|
|
@ -34,5 +34,11 @@
|
|||
"paramLongName": "retryDelay",
|
||||
"paramDescription": "the delay (ms) between retries",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rm",
|
||||
"paramLongName": "requestMethod",
|
||||
"paramDescription": "the method of the requests to perform",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -8,4 +8,8 @@ softwareCodeRepositoryURLs=${workingDir}/1_code_repo_urls.csv
|
|||
lastVisitsPath=${workingDir}/2_last_visits.seq
|
||||
archiveRequestsPath=${workingDir}/3_archive_requests.seq
|
||||
|
||||
maxNumberOfRetry=2
|
||||
retryDelay=1
|
||||
requestDelay=100
|
||||
|
||||
resume=collect-software-repository-urls
|
||||
|
|
|
@ -8,7 +8,27 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>softwareCodeRepositoryURLs</name>
|
||||
<description>The path in the HDSF to save the software repository URLs</description>
|
||||
<description>The path in the HDFS to save the software repository URLs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>lastVisitsPath</name>
|
||||
<description>The path in the HDFS to save the responses of the last visit requests</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>archiveRequestsPath</name>
|
||||
<description>The path in the HDFS to save the responses of the archive requests</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>maxNumberOfRetry</name>
|
||||
<description>Max number of retries for failed API calls</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>retryDelay</name>
|
||||
<description>Retry delay for failed requests (in sec)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>requestDelay</name>
|
||||
<description>Delay between API requests (in ms)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>resume</name>
|
||||
|
@ -75,9 +95,9 @@
|
|||
<arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
|
||||
<arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
|
||||
|
||||
<arg>--maxNumberOfRetry</arg><arg>2</arg>
|
||||
<arg>--requestDelay</arg><arg>0</arg>
|
||||
<arg>--retryDelay</arg><arg>1</arg>
|
||||
<arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
|
||||
<arg>--requestDelay</arg><arg>${requestDelay}</arg>
|
||||
<arg>--retryDelay</arg><arg>${retryDelay}</arg>
|
||||
<arg>--requestMethod</arg><arg>GET</arg>
|
||||
|
||||
</java>
|
||||
|
@ -91,11 +111,12 @@
|
|||
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
|
||||
<arg>--archiveRequestsPath</arg><arg>${archiveRequestsPath}</arg>
|
||||
<arg>--archiveThresholdInDays</arg><arg>365</arg>
|
||||
|
||||
<arg>--maxNumberOfRetry</arg><arg>2</arg>
|
||||
<arg>--requestDelay</arg><arg>0</arg>
|
||||
<arg>--retryDelay</arg><arg>1</arg>
|
||||
<arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
|
||||
<arg>--requestDelay</arg><arg>${requestDelay}</arg>
|
||||
<arg>--retryDelay</arg><arg>${retryDelay}</arg>
|
||||
<arg>--requestMethod</arg><arg>POST</arg>
|
||||
|
||||
</java>
|
||||
|
|
|
@ -1,35 +1,38 @@
|
|||
package eu.dnetlib.dhp.swh;
|
||||
|
||||
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.junit.jupiter.api.Test;
|
||||
package eu.dnetlib.dhp.swh;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.text.ParseException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
||||
|
||||
public class ArchiveRepositoryURLsTest {
|
||||
|
||||
@Test
|
||||
void testArchive() throws IOException {
|
||||
String inputPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv")
|
||||
.getPath();
|
||||
@Test
|
||||
void testArchive() throws IOException, ParseException {
|
||||
String inputPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv")
|
||||
.getPath();
|
||||
|
||||
File file = new File(inputPath);
|
||||
FileReader fr = new FileReader(file);
|
||||
BufferedReader br = new BufferedReader(fr); //creates a buffering character input stream
|
||||
File file = new File(inputPath);
|
||||
FileReader fr = new FileReader(file);
|
||||
BufferedReader br = new BufferedReader(fr); // creates a buffering character input stream
|
||||
|
||||
String line;
|
||||
while((line = br.readLine()) != null) {
|
||||
String[] tokens = line.split("\t");
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
String[] tokens = line.split("\t");
|
||||
|
||||
String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365);
|
||||
System.out.println(tokens[0] + "\t" + response);
|
||||
System.out.println();
|
||||
}
|
||||
fr.close();
|
||||
}
|
||||
String response = ArchiveRepositoryURLs.handleRecord(tokens[0], tokens[1], 365);
|
||||
System.out.println(tokens[0] + "\t" + response);
|
||||
System.out.println();
|
||||
}
|
||||
fr.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,17 +1,18 @@
|
|||
|
||||
package eu.dnetlib.dhp.swh;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConnection;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConstants;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
|
||||
//import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
|
||||
|
@ -24,7 +25,7 @@ public class SWHConnectionTest {
|
|||
HttpClientParams clientParams = new HttpClientParams();
|
||||
clientParams.setRequestMethod("GET");
|
||||
|
||||
SWHConnection swhConnection = new SWHConnection(clientParams);
|
||||
SWHConnection swhConnection = new SWHConnection(clientParams);
|
||||
|
||||
String repoUrl = "https://github.com/stanford-futuredata/FAST";
|
||||
URL url = new URL(String.format(SWHConstants.SWH_LATEST_VISIT_URL, repoUrl));
|
||||
|
@ -42,7 +43,7 @@ public class SWHConnectionTest {
|
|||
HttpClientParams clientParams = new HttpClientParams();
|
||||
clientParams.setRequestMethod("POST");
|
||||
|
||||
SWHConnection swhConnection = new SWHConnection(clientParams);
|
||||
SWHConnection swhConnection = new SWHConnection(clientParams);
|
||||
|
||||
String repoUrl = "https://github.com/stanford-futuredata/FAST";
|
||||
URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, SWHConstants.DEFAULT_VISIT_TYPE, repoUrl));
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
https://bitbucket.org/samskillman/yt-stokes {"origin":"https://bitbucket.org/samskillman/yt-stokes","visit":43,"date":"2021-09-13T21:59:27.125171+00:00","status":"failed","snapshot":null,"type":"hg","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://bitbucket.org/samskillman/yt-stokes/get/","snapshot_url":null}
|
||||
https://github.com/bioinsilico/BIPSPI {"origin":"https://github.com/bioinsilico/BIPSPI","visit":1,"date":"2020-03-18T14:50:21.541822+00:00","status":"full","snapshot":"c6c69d2cd73ce89811448da5f031611df6f63bdb","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/bioinsilico/BIPSPI/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/c6c69d2cd73ce89811448da5f031611df6f63bdb/"}
|
||||
https://github.com/mloop/kdiff-type1-error-rate/blob/master/analysis/simulation.R {}
|
||||
https://github.com/schwanbeck/YSMR {"origin":"https://github.com/schwanbeck/YSMR","visit":6,"date":"2023-08-02T15:25:02.650676+00:00","status":"full","snapshot":"a9d1c5f0bca2def198b89f65bc9f7da3be8439ed","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/schwanbeck/YSMR/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/a9d1c5f0bca2def198b89f65bc9f7da3be8439ed/"}
|
||||
|
|
Can't render this file because it contains an unexpected character in line 1 and column 40.
|
Loading…
Reference in New Issue