|
|
|
@ -1,24 +1,17 @@
|
|
|
|
|
|
|
|
|
|
package eu.dnetlib.dhp.collection.worker.utils;
|
|
|
|
|
|
|
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.io.InputStream;
|
|
|
|
|
import java.net.*;
|
|
|
|
|
import java.security.GeneralSecurityException;
|
|
|
|
|
import java.security.cert.X509Certificate;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
|
|
|
|
|
import javax.net.ssl.HttpsURLConnection;
|
|
|
|
|
import javax.net.ssl.SSLContext;
|
|
|
|
|
import javax.net.ssl.TrustManager;
|
|
|
|
|
import javax.net.ssl.X509TrustManager;
|
|
|
|
|
|
|
|
|
|
import org.apache.commons.io.IOUtils;
|
|
|
|
|
import org.apache.commons.lang3.math.NumberUtils;
|
|
|
|
|
import org.apache.commons.logging.Log;
|
|
|
|
|
import org.apache.commons.logging.LogFactory;
|
|
|
|
|
import org.apache.http.HttpHeaders;
|
|
|
|
|
import org.slf4j.Logger;
|
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
|
|
|
|
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
|
|
|
|
|
|
|
|
@ -29,162 +22,151 @@ import eu.dnetlib.dhp.collection.worker.CollectorException;
|
|
|
|
|
*/
|
|
|
|
|
public class HttpConnector2 {
|
|
|
|
|
|
|
|
|
|
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
|
|
|
|
|
|
|
|
|
|
private static final String REPORT_PREFIX = "http:";
|
|
|
|
|
|
|
|
|
|
private int maxNumberOfRetry = 6;
|
|
|
|
|
private int defaultDelay = 120; // seconds
|
|
|
|
|
private int readTimeOut = 120; // seconds
|
|
|
|
|
private HttpClientParams clientParams;
|
|
|
|
|
|
|
|
|
|
private String responseType = null;
|
|
|
|
|
|
|
|
|
|
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
|
|
|
|
|
|
|
|
|
/**
 * Creates a connector backed by a default-constructed {@link HttpClientParams}.
 */
public HttpConnector2() {
	this(new HttpClientParams());
}
|
|
|
|
|
|
|
|
|
|
/**
 * Creates a connector that reads its settings from the given client parameters.
 * <p>
 * Side effect: installs a default {@link CookieHandler} (a {@link CookieManager} using
 * {@link CookiePolicy#ACCEPT_ALL}) via the static {@link CookieHandler#setDefault} call,
 * replacing whichever handler was set before.
 *
 * @param clientParams the parameters this connector will use
 */
public HttpConnector2(HttpClientParams clientParams) {
	this.clientParams = clientParams;
	CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @see HttpConnector2#getInputSource(java.lang.String, eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList)
|
|
|
|
|
* @see HttpConnector2#getInputSource(java.lang.String, CollectorPluginReport)
|
|
|
|
|
*/
|
|
|
|
|
public String getInputSource(final String requestUrl) throws CollectorException {
|
|
|
|
|
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
|
|
|
|
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
|
|
|
|
|
return IOUtils.toInputStream(getInputSource(requestUrl));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @see HttpConnector2#getInputSource(java.lang.String, eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList)
|
|
|
|
|
* @see HttpConnector2#getInputSource(java.lang.String, CollectorPluginReport)
|
|
|
|
|
*/
|
|
|
|
|
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
|
|
|
|
|
return IOUtils.toInputStream(getInputSource(requestUrl));
|
|
|
|
|
public String getInputSource(final String requestUrl) throws CollectorException {
|
|
|
|
|
return attemptDownloadAsString(requestUrl, 1, new CollectorPluginReport());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Given the URL returns the content via HTTP GET
|
|
|
|
|
*
|
|
|
|
|
* @param requestUrl the URL
|
|
|
|
|
* @param errorLogList the list of errors
|
|
|
|
|
* @param report the list of errors
|
|
|
|
|
* @return the content of the downloaded resource
|
|
|
|
|
* @throws CollectorException when retrying more than maxNumberOfRetry times
|
|
|
|
|
*/
|
|
|
|
|
public String getInputSource(final String requestUrl, CollectorPluginErrorLogList errorLogList)
|
|
|
|
|
public String getInputSource(final String requestUrl, CollectorPluginReport report)
|
|
|
|
|
throws CollectorException {
|
|
|
|
|
return attemptDownlaodAsString(requestUrl, 1, errorLogList);
|
|
|
|
|
return attemptDownloadAsString(requestUrl, 1, report);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber,
|
|
|
|
|
final CollectorPluginErrorLogList errorList)
|
|
|
|
|
throws CollectorException {
|
|
|
|
|
try {
|
|
|
|
|
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
|
|
|
|
try {
|
|
|
|
|
return IOUtils.toString(s);
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
|
|
|
|
Thread.sleep(defaultDelay * 1000);
|
|
|
|
|
errorList.add(e.getMessage());
|
|
|
|
|
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
|
|
|
|
} finally {
|
|
|
|
|
IOUtils.closeQuietly(s);
|
|
|
|
|
}
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
|
|
|
|
|
final CollectorPluginReport report) throws CollectorException {
|
|
|
|
|
|
|
|
|
|
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
|
|
|
|
|
return IOUtils.toString(s);
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
log.error(e.getMessage(), e);
|
|
|
|
|
throw new CollectorException(e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
|
|
|
|
|
final CollectorPluginErrorLogList errorList)
|
|
|
|
|
throws CollectorException {
|
|
|
|
|
final CollectorPluginReport report) throws CollectorException, IOException {
|
|
|
|
|
|
|
|
|
|
if (retryNumber > maxNumberOfRetry) {
|
|
|
|
|
throw new CollectorException("Max number of retries exceeded. Cause: \n " + errorList);
|
|
|
|
|
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
|
|
|
|
|
throw new CollectorException("Max number of retries exceeded. Cause: \n " + report);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
|
|
|
|
try {
|
|
|
|
|
InputStream input = null;
|
|
|
|
|
log.info("Downloading attempt {} [{}]", retryNumber, requestUrl);
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
|
|
|
|
urlConn.setInstanceFollowRedirects(false);
|
|
|
|
|
urlConn.setReadTimeout(readTimeOut * 1000);
|
|
|
|
|
urlConn.addRequestProperty("User-Agent", userAgent);
|
|
|
|
|
InputStream input = null;
|
|
|
|
|
|
|
|
|
|
if (log.isDebugEnabled()) {
|
|
|
|
|
logHeaderFields(urlConn);
|
|
|
|
|
}
|
|
|
|
|
try {
|
|
|
|
|
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
|
|
|
|
urlConn.setInstanceFollowRedirects(false);
|
|
|
|
|
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
|
|
|
|
|
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
|
|
|
|
|
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
|
|
|
|
|
|
|
|
|
|
if (log.isDebugEnabled()) {
|
|
|
|
|
logHeaderFields(urlConn);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
|
|
|
|
if (is2xx(urlConn.getResponseCode())) {
|
|
|
|
|
input = urlConn.getInputStream();
|
|
|
|
|
responseType = urlConn.getContentType();
|
|
|
|
|
return input;
|
|
|
|
|
}
|
|
|
|
|
if (is3xx(urlConn.getResponseCode())) {
|
|
|
|
|
// REDIRECTS
|
|
|
|
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
|
|
|
|
log.debug(String.format("The requested url %s has been moved to %s", requestUrl, newUrl));
|
|
|
|
|
errorList
|
|
|
|
|
.add(
|
|
|
|
|
String
|
|
|
|
|
.format(
|
|
|
|
|
"%s %s %s. Moved to: %s", requestUrl, urlConn.getResponseCode(),
|
|
|
|
|
urlConn.getResponseMessage(), newUrl));
|
|
|
|
|
urlConn.disconnect();
|
|
|
|
|
if (retryAfter > 0)
|
|
|
|
|
Thread.sleep(retryAfter * 1000);
|
|
|
|
|
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
|
|
|
|
}
|
|
|
|
|
if (is4xx(urlConn.getResponseCode())) {
|
|
|
|
|
// CLIENT ERROR, DO NOT RETRY
|
|
|
|
|
errorList
|
|
|
|
|
.add(
|
|
|
|
|
String
|
|
|
|
|
.format(
|
|
|
|
|
"%s error %s: %s", requestUrl, urlConn.getResponseCode(),
|
|
|
|
|
urlConn.getResponseMessage()));
|
|
|
|
|
throw new CollectorException("4xx error: request will not be repeated. " + errorList);
|
|
|
|
|
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
|
|
|
|
if (is2xx(urlConn.getResponseCode())) {
|
|
|
|
|
input = urlConn.getInputStream();
|
|
|
|
|
responseType = urlConn.getContentType();
|
|
|
|
|
return input;
|
|
|
|
|
}
|
|
|
|
|
if (is3xx(urlConn.getResponseCode())) {
|
|
|
|
|
// REDIRECTS
|
|
|
|
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
|
|
|
|
log.info(String.format("The requested url has been moved to %s", newUrl));
|
|
|
|
|
report
|
|
|
|
|
.put(
|
|
|
|
|
REPORT_PREFIX + urlConn.getResponseCode(),
|
|
|
|
|
String.format("Moved to: %s", newUrl));
|
|
|
|
|
urlConn.disconnect();
|
|
|
|
|
if (retryAfter > 0) {
|
|
|
|
|
backoffAndSleep(retryAfter);
|
|
|
|
|
}
|
|
|
|
|
if (is5xx(urlConn.getResponseCode())) {
|
|
|
|
|
// SERVER SIDE ERRORS RETRY ONLY on 503
|
|
|
|
|
switch (urlConn.getResponseCode()) {
|
|
|
|
|
case HttpURLConnection.HTTP_UNAVAILABLE:
|
|
|
|
|
if (retryAfter > 0) {
|
|
|
|
|
log
|
|
|
|
|
.warn(
|
|
|
|
|
requestUrl + " - waiting and repeating request after suggested retry-after "
|
|
|
|
|
+ retryAfter + " sec.");
|
|
|
|
|
Thread.sleep(retryAfter * 1000);
|
|
|
|
|
} else {
|
|
|
|
|
log
|
|
|
|
|
.warn(
|
|
|
|
|
requestUrl + " - waiting and repeating request after default delay of "
|
|
|
|
|
+ defaultDelay + " sec.");
|
|
|
|
|
Thread.sleep(defaultDelay * 1000);
|
|
|
|
|
}
|
|
|
|
|
errorList.add(requestUrl + " 503 Service Unavailable");
|
|
|
|
|
urlConn.disconnect();
|
|
|
|
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
|
|
|
|
default:
|
|
|
|
|
errorList
|
|
|
|
|
.add(
|
|
|
|
|
String
|
|
|
|
|
.format(
|
|
|
|
|
"%s Error %s: %s", requestUrl, urlConn.getResponseCode(),
|
|
|
|
|
urlConn.getResponseMessage()));
|
|
|
|
|
throw new CollectorException(urlConn.getResponseCode() + " error " + errorList);
|
|
|
|
|
}
|
|
|
|
|
return attemptDownload(newUrl, retryNumber + 1, report);
|
|
|
|
|
}
|
|
|
|
|
if (is4xx(urlConn.getResponseCode())) {
|
|
|
|
|
// CLIENT ERROR, DO NOT RETRY
|
|
|
|
|
report
|
|
|
|
|
.put(
|
|
|
|
|
REPORT_PREFIX + urlConn.getResponseCode(),
|
|
|
|
|
String
|
|
|
|
|
.format(
|
|
|
|
|
"%s error: %s", requestUrl, urlConn.getResponseMessage()));
|
|
|
|
|
throw new CollectorException("4xx error: request will not be repeated. " + report);
|
|
|
|
|
}
|
|
|
|
|
if (is5xx(urlConn.getResponseCode())) {
|
|
|
|
|
// SERVER SIDE ERRORS RETRY ONLY on 503
|
|
|
|
|
switch (urlConn.getResponseCode()) {
|
|
|
|
|
case HttpURLConnection.HTTP_UNAVAILABLE:
|
|
|
|
|
if (retryAfter > 0) {
|
|
|
|
|
log
|
|
|
|
|
.warn(
|
|
|
|
|
requestUrl + " - waiting and repeating request after suggested retry-after "
|
|
|
|
|
+ retryAfter + " sec.");
|
|
|
|
|
backoffAndSleep(retryAfter * 1000);
|
|
|
|
|
} else {
|
|
|
|
|
log
|
|
|
|
|
.warn(
|
|
|
|
|
requestUrl + " - waiting and repeating request after default delay of "
|
|
|
|
|
+ getClientParams().getRetryDelay() + " sec.");
|
|
|
|
|
backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
|
|
|
|
|
}
|
|
|
|
|
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
|
|
|
|
|
urlConn.disconnect();
|
|
|
|
|
return attemptDownload(requestUrl, retryNumber + 1, report);
|
|
|
|
|
default:
|
|
|
|
|
report
|
|
|
|
|
.put(
|
|
|
|
|
REPORT_PREFIX + urlConn.getResponseCode(),
|
|
|
|
|
String
|
|
|
|
|
.format(
|
|
|
|
|
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
|
|
|
|
|
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
|
|
|
|
|
}
|
|
|
|
|
throw new CollectorException(
|
|
|
|
|
String.format("Unexpected status code: %s error %s", urlConn.getResponseCode(), errorList));
|
|
|
|
|
} catch (MalformedURLException | NoRouteToHostException e) {
|
|
|
|
|
errorList.add(String.format("Error: %s for request url: %s", e.getCause(), requestUrl));
|
|
|
|
|
throw new CollectorException(e + "error " + errorList);
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
Thread.sleep(defaultDelay * 1000);
|
|
|
|
|
errorList.add(requestUrl + " " + e.getMessage());
|
|
|
|
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
|
|
|
|
}
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
throw new CollectorException(e);
|
|
|
|
|
throw new CollectorException(
|
|
|
|
|
String.format("Unexpected status code: %s error %s", urlConn.getResponseCode(), report));
|
|
|
|
|
} catch (MalformedURLException | SocketException | UnknownHostException e) {
|
|
|
|
|
log.error(e.getMessage(), e);
|
|
|
|
|
report.put(e.getClass().getName(), e.getMessage());
|
|
|
|
|
throw new CollectorException(e.getMessage(), e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -200,12 +182,21 @@ public class HttpConnector2 {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void backoffAndSleep(int sleepTime) throws CollectorException {
|
|
|
|
|
log.info("I'm going to sleep for {}ms", sleepTime);
|
|
|
|
|
try {
|
|
|
|
|
Thread.sleep(sleepTime);
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
log.error(e.getMessage(), e);
|
|
|
|
|
throw new CollectorException(e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
|
|
|
|
for (String key : headerMap.keySet()) {
|
|
|
|
|
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0)
|
|
|
|
|
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (headerMap.get(key).size() > 0)
|
|
|
|
|
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
|
|
|
|
return Integer
|
|
|
|
|
.parseInt(headerMap.get(key).get(0)) + 10;
|
|
|
|
|
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return -1;
|
|
|
|
@ -213,44 +204,13 @@ public class HttpConnector2 {
|
|
|
|
|
|
|
|
|
|
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
|
|
|
|
|
for (String key : headerMap.keySet()) {
|
|
|
|
|
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) {
|
|
|
|
|
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
|
|
|
|
|
return headerMap.get(key).get(0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
|
|
|
|
*/
|
|
|
|
|
public void initTrustManager() {
|
|
|
|
|
final X509TrustManager tm = new X509TrustManager() {
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public X509Certificate[] getAcceptedIssuers() {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
try {
|
|
|
|
|
final SSLContext ctx = SSLContext.getInstance("TLS");
|
|
|
|
|
ctx.init(null, new TrustManager[] {
|
|
|
|
|
tm
|
|
|
|
|
}, null);
|
|
|
|
|
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
|
|
|
|
} catch (GeneralSecurityException e) {
|
|
|
|
|
log.fatal(e);
|
|
|
|
|
throw new IllegalStateException(e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private boolean is2xx(final int statusCode) {
|
|
|
|
|
return statusCode >= 200 && statusCode <= 299;
|
|
|
|
|
}
|
|
|
|
@ -267,32 +227,15 @@ public class HttpConnector2 {
|
|
|
|
|
return statusCode >= 500 && statusCode <= 599;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int getMaxNumberOfRetry() {
|
|
|
|
|
return maxNumberOfRetry;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
|
|
|
|
this.maxNumberOfRetry = maxNumberOfRetry;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int getDefaultDelay() {
|
|
|
|
|
return defaultDelay;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setDefaultDelay(final int defaultDelay) {
|
|
|
|
|
this.defaultDelay = defaultDelay;
|
|
|
|
|
public String getResponseType() {
|
|
|
|
|
return responseType;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int getReadTimeOut() {
|
|
|
|
|
return readTimeOut;
|
|
|
|
|
public HttpClientParams getClientParams() {
|
|
|
|
|
return clientParams;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setReadTimeOut(final int readTimeOut) {
|
|
|
|
|
this.readTimeOut = readTimeOut;
|
|
|
|
|
public void setClientParams(HttpClientParams clientParams) {
|
|
|
|
|
this.clientParams = clientParams;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getResponseType() {
|
|
|
|
|
return responseType;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|