Add step for archiving repoUrls to SWH

This commit is contained in:
Serafeim Chatzopoulos 2023-09-28 20:56:18 +03:00
parent ed9c81a0b7
commit ab0d70691c
14 changed files with 230 additions and 118 deletions

View File

@ -53,9 +53,9 @@ public class HttpClientParams {
*/
private String requestMethod;
public HttpClientParams() {
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(), _requestMethod);
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(),
_requestMethod);
}
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,

View File

@ -1,14 +1,16 @@
package eu.dnetlib.dhp.swh;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.swh.models.LastVisitData;
import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants;
import eu.dnetlib.dhp.swh.utils.SWHUtils;
import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
@ -17,14 +19,17 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URL;
import java.util.Date;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import com.fasterxml.jackson.databind.ObjectMapper;
import static eu.dnetlib.dhp.common.Constants.REQUEST_METHOD;
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.swh.models.LastVisitData;
import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants;
import eu.dnetlib.dhp.swh.utils.SWHUtils;
/**
* Sends archive requests to the SWH API for those software repository URLs that are missing from them
@ -69,7 +74,8 @@ public class ArchiveRepositoryURLs {
}
private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays) throws IOException {
private static void archive(FileSystem fs, String inputPath, String outputPath, Integer archiveThresholdInDays)
throws IOException {
SequenceFile.Reader fr = SWHUtils.getSequenceFileReader(fs, inputPath);
SequenceFile.Writer fw = SWHUtils.getSequenceFileWriter(fs, outputPath);
@ -81,7 +87,13 @@ public class ArchiveRepositoryURLs {
// Read key-value pairs from the SequenceFile and handle appropriately
while (fr.next(repoUrl, lastVisitData)) {
String response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
String response = null;
try {
response = handleRecord(repoUrl.toString(), lastVisitData.toString(), archiveThresholdInDays);
} catch (java.text.ParseException e) {
log.error("Could not handle record with repo Url: {}", repoUrl.toString());
throw new RuntimeException(e);
}
// response is equal to null when no need for request
if (response != null) {
@ -95,43 +107,68 @@ public class ArchiveRepositoryURLs {
fr.close();
}
public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays) throws IOException {
System.out.println("Key: " + repoUrl + ", Value: " + lastVisitData);
public static String handleRecord(String repoUrl, String lastVisitData, Integer archiveThresholdInDays)
throws IOException, java.text.ParseException {
log.info("{ Key: {}, Value: {} }", repoUrl, lastVisitData);
LastVisitData lastVisit = OBJECT_MAPPER.readValue(lastVisitData, LastVisitData.class);
// perform an archive request when no repoUrl was not found in previous step
// a previous attempt for archival has been made, and repository URL was not found
// avoid performing the same archive request again
if (lastVisit.getType() != null &&
lastVisit.getType().equals(SWHConstants.VISIT_STATUS_NOT_FOUND)) {
log.info("Avoid request -- previous archive request returned NOT_FOUND");
return null;
}
// if we have last visit data
if (lastVisit.getSnapshot() != null) {
// OR last visit was before (now() - archiveThresholdInDays)
long diffInMillies = Math.abs((new Date()).getTime() - lastVisit.getDate().getTime());
long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
String cleanDate = GraphCleaningFunctions.cleanDate(lastVisit.getDate());
// and the last visit date can be parsed
if (cleanDate != null) {
SimpleDateFormat formatter = new SimpleDateFormat(ModelSupport.DATE_FORMAT);
Date lastVisitDate = formatter.parse(cleanDate);
// OR last visit time < (now() - archiveThresholdInDays)
long diffInMillies = Math.abs((new Date()).getTime() - lastVisitDate.getTime());
long diffInDays = TimeUnit.DAYS.convert(diffInMillies, TimeUnit.MILLISECONDS);
log.info("Date diff from now (in days): {}", diffInDays);
// do not perform a request, if the last visit date is no older than $archiveThresholdInDays
if (archiveThresholdInDays >= diffInDays) {
log.info("Avoid request -- no older than {} days", archiveThresholdInDays);
return null;
}
}
}
// if last visit data are available, re-use version control type, else use the default one (i.e., git)
// ELSE perform an archive request
log.info("Perform archive request for: {}", repoUrl);
// if last visit data are available, re-use version control type,
// else use the default one (i.e., git)
String visitType = Optional
.ofNullable(lastVisit.getType())
.orElse(SWHConstants.DEFAULT_VISIT_TYPE);
URL url = new URL(String.format(SWHConstants.SWH_ARCHIVE_URL, visitType, repoUrl.trim()));
System.out.println(url.toString());
log.info("Sending archive request: {}", url);
String response;
try {
response = swhConnection.call(url.toString());
} catch (CollectorException e) {
log.info("Error in request: {}", url);
log.error("Error in request: {}", url);
response = "{}";
}
return response;
}
}

View File

@ -1,12 +1,15 @@
package eu.dnetlib.dhp.swh;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants;
import eu.dnetlib.dhp.swh.utils.SWHUtils;
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
@ -18,14 +21,12 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants;
import eu.dnetlib.dhp.swh.utils.SWHUtils;
/**
* Given a file with software repository URLs, this class
@ -107,7 +108,7 @@ public class CollectLastVisitRepositoryData {
try {
response = swhConnection.call(url.toString());
} catch (CollectorException e) {
log.info("Error in request: {}", url);
log.error("Error in request: {}", url);
response = "{}";
}

View File

@ -1,8 +1,11 @@
package eu.dnetlib.dhp.swh;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
@ -12,10 +15,8 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Collects unique software repository URLs in the Graph using Hive
@ -69,7 +70,7 @@ public class CollectSoftwareRepositoryURLs implements Serializable {
"WHERE coderepositoryurl.value IS NOT NULL " +
"AND datainfo.deletedbyinference = FALSE " +
"AND datainfo.invisible = FALSE " +
"LIMIT 1000"; // TODO remove
"LIMIT 1000";
String query = String.format(queryTemplate, hiveDbName);
log.info("Hive query to fetch software code URLs: {}", query);

View File

@ -1,21 +1,23 @@
package eu.dnetlib.dhp.swh.models;
import java.util.Date;
import com.cloudera.com.fasterxml.jackson.annotation.JsonFormat;
import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import java.util.Date;
@JsonIgnoreProperties(ignoreUnknown = true)
public class LastVisitData {
private String type;
private Date date;
private String date;
@JsonProperty("snapshot")
private String snapshotId;
private String status;
public String getType() {
return type;
}
@ -24,11 +26,11 @@ public class LastVisitData {
this.type = type;
}
public Date getDate() {
public String getDate() {
return date;
}
public void setDate(Date date) {
public void setDate(String date) {
this.date = date;
}
@ -39,4 +41,12 @@ public class LastVisitData {
public void setSnapshot(String snapshotId) {
this.snapshotId = snapshotId;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
}

View File

@ -6,8 +6,10 @@ public class SWHConstants {
public static final String SWH_ARCHIVE_URL = "https://archive.softwareheritage.org/api/1/origin/save/%s/url/%s/";
public static final String ACCESS_TOKEN = "";
public static final String ACCESS_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtODQ2Ny05MzAyZjk3MTFkOGEifQ.eyJpYXQiOjE2OTQ2MzYwMjAsImp0aSI6IjkwZjdkNTNjLTQ5YTktNGFiMy1hY2E0LTcwMTViMjEyZTNjNiIsImlzcyI6Imh0dHBzOi8vYXV0aC5zb2Z0d2FyZWhlcml0YWdlLm9yZy9hdXRoL3JlYWxtcy9Tb2Z0d2FyZUhlcml0YWdlIiwiYXVkIjoiaHR0cHM6Ly9hdXRoLnNvZnR3YXJlaGVyaXRhZ2Uub3JnL2F1dGgvcmVhbG1zL1NvZnR3YXJlSGVyaXRhZ2UiLCJzdWIiOiIzMTY5OWZkNC0xNmE0LTQxOWItYTdhMi00NjI5MDY4ZjI3OWEiLCJ0eXAiOiJPZmZsaW5lIiwiYXpwIjoic3doLXdlYiIsInNlc3Npb25fc3RhdGUiOiIzMjYzMzEwMS00ZDRkLTQwMjItODU2NC1iMzNlMTJiNTE3ZDkiLCJzY29wZSI6Im9wZW5pZCBvZmZsaW5lX2FjY2VzcyBwcm9maWxlIGVtYWlsIn0.XHj1VIZu1dZ4Ej32-oU84mFmaox9cLNjXosNxwZM0Xs";
public static final String DEFAULT_VISIT_TYPE = "git";
public static final String VISIT_STATUS_NOT_FOUND = "not_found";
}

View File

@ -1,8 +1,14 @@
package eu.dnetlib.dhp.swh.utils;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import static eu.dnetlib.dhp.common.Constants.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@ -11,13 +17,8 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import static eu.dnetlib.dhp.common.Constants.*;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
public class SWHUtils {

View File

@ -11,12 +11,36 @@
"paramDescription": "the URL where to store last visits data",
"paramRequired": true
},
{
"paramName": "arp",
"paramLongName": "archiveRequestsPath",
"paramDescription": "the URL where to store the responses of the archive requests",
"paramRequired": true
},
{
"paramName": "mnr",
"paramLongName": "maxNumberOfRetry",
"paramDescription": "the maximum number of admitted connection retries",
"paramRequired": false
},
{
"paramName": "rqd",
"paramLongName": "requestDelay",
"paramDescription": "the delay (ms) between requests",
"paramRequired": false
},
{
"paramName": "rtd",
"paramLongName": "retryDelay",
"paramDescription": "the delay (ms) between retries",
"paramRequired": false
},
{
"paramName": "rm",
"paramLongName": "requestMethod",
"paramDescription": "the method of the requests to perform",
"paramRequired": false
},
{
"paramName": "atid",
"paramLongName": "archiveThresholdInDays",

View File

@ -34,5 +34,11 @@
"paramLongName": "retryDelay",
"paramDescription": "the delay (ms) between retries",
"paramRequired": false
},
{
"paramName": "rm",
"paramLongName": "requestMethod",
"paramDescription": "the method of the requests to perform",
"paramRequired": false
}
]

View File

@ -8,4 +8,8 @@ softwareCodeRepositoryURLs=${workingDir}/1_code_repo_urls.csv
lastVisitsPath=${workingDir}/2_last_visits.seq
archiveRequestsPath=${workingDir}/3_archive_requests.seq
maxNumberOfRetry=2
retryDelay=1
requestDelay=100
resume=collect-software-repository-urls

View File

@ -8,7 +8,27 @@
</property>
<property>
<name>softwareCodeRepositoryURLs</name>
<description>The path in the HDSF to save the software repository URLs</description>
<description>The path in the HDFS to save the software repository URLs</description>
</property>
<property>
<name>lastVisitsPath</name>
<description>The path in the HDFS to save the responses of the last visit requests</description>
</property>
<property>
<name>archiveRequestsPath</name>
<description>The path in the HDFS to save the responses of the archive requests</description>
</property>
<property>
<name>maxNumberOfRetry</name>
<description>Max number of retries for failed API calls</description>
</property>
<property>
<name>retryDelay</name>
<description>Retry delay for failed requests (in sec)</description>
</property>
<property>
<name>requestDelay</name>
<description>Delay between API requests (in ms)</description>
</property>
<property>
<name>resume</name>
@ -75,9 +95,9 @@
<arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
<arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
<arg>--maxNumberOfRetry</arg><arg>2</arg>
<arg>--requestDelay</arg><arg>0</arg>
<arg>--retryDelay</arg><arg>1</arg>
<arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
<arg>--requestDelay</arg><arg>${requestDelay}</arg>
<arg>--retryDelay</arg><arg>${retryDelay}</arg>
<arg>--requestMethod</arg><arg>GET</arg>
</java>
@ -91,11 +111,12 @@
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--lastVisitsPath</arg><arg>${lastVisitsPath}</arg>
<arg>--archiveRequestsPath</arg><arg>${archiveRequestsPath}</arg>
<arg>--archiveThresholdInDays</arg><arg>365</arg>
<arg>--maxNumberOfRetry</arg><arg>2</arg>
<arg>--requestDelay</arg><arg>0</arg>
<arg>--retryDelay</arg><arg>1</arg>
<arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
<arg>--requestDelay</arg><arg>${requestDelay}</arg>
<arg>--retryDelay</arg><arg>${retryDelay}</arg>
<arg>--requestMethod</arg><arg>POST</arg>
</java>

View File

@ -1,19 +1,22 @@
package eu.dnetlib.dhp.swh;
import eu.dnetlib.dhp.swh.utils.SWHUtils;
import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.Test;
package eu.dnetlib.dhp.swh;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.ParseException;
import java.util.Arrays;
import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.swh.utils.SWHUtils;
public class ArchiveRepositoryURLsTest {
@Test
void testArchive() throws IOException {
void testArchive() throws IOException, ParseException {
String inputPath = getClass()
.getResource("/eu/dnetlib/dhp/swh/lastVisitDataToArchive.csv")
.getPath();

View File

@ -1,17 +1,18 @@
package eu.dnetlib.dhp.swh;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
//import org.apache.hadoop.hdfs.MiniDFSCluster;

View File

@ -1,3 +1,4 @@
https://bitbucket.org/samskillman/yt-stokes {"origin":"https://bitbucket.org/samskillman/yt-stokes","visit":43,"date":"2021-09-13T21:59:27.125171+00:00","status":"failed","snapshot":null,"type":"hg","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://bitbucket.org/samskillman/yt-stokes/get/","snapshot_url":null}
https://github.com/bioinsilico/BIPSPI {"origin":"https://github.com/bioinsilico/BIPSPI","visit":1,"date":"2020-03-18T14:50:21.541822+00:00","status":"full","snapshot":"c6c69d2cd73ce89811448da5f031611df6f63bdb","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/bioinsilico/BIPSPI/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/c6c69d2cd73ce89811448da5f031611df6f63bdb/"}
https://github.com/mloop/kdiff-type1-error-rate/blob/master/analysis/simulation.R {}
https://github.com/schwanbeck/YSMR {"origin":"https://github.com/schwanbeck/YSMR","visit":6,"date":"2023-08-02T15:25:02.650676+00:00","status":"full","snapshot":"a9d1c5f0bca2def198b89f65bc9f7da3be8439ed","type":"git","metadata":{},"origin_url":"https://archive.softwareheritage.org/api/1/origin/https://github.com/schwanbeck/YSMR/get/","snapshot_url":"https://archive.softwareheritage.org/api/1/snapshot/a9d1c5f0bca2def198b89f65bc9f7da3be8439ed/"}

Can't render this file because it contains an unexpected character in line 1 and column 40.