- Make use of the new Normalizer utilized by the PublicationsRetriever plugin.
- Code polishing.
This commit is contained in:
parent
2aedae2367
commit
9c897b8bf4
|
@ -86,9 +86,6 @@ public class AssignmentsHandler {
|
||||||
assignmentsForPlugins = HashMultimap.create(expectedDatasourcesPerRequest, expectedAssignmentsPerDatasource);
|
assignmentsForPlugins = HashMultimap.create(expectedDatasourcesPerRequest, expectedAssignmentsPerDatasource);
|
||||||
requestUrl = this.controllerBaseUrl + (askForTest ? "test/" : "") + "urls?workerId=" + this.workerId + "&workerAssignmentsLimit=" + this.maxAssignmentsLimitPerBatch;
|
requestUrl = this.controllerBaseUrl + (askForTest ? "test/" : "") + "urls?workerId=" + this.workerId + "&workerAssignmentsLimit=" + this.maxAssignmentsLimitPerBatch;
|
||||||
|
|
||||||
long durationInHours = (long) Math.ceil((double) this.maxAssignmentsLimitPerBatch / 1000); // For example, for 10_000 assignments we wait at most 10 hours.
|
|
||||||
logger.debug("Setting the max-connection duration for the \"post-worker-report\" to " + durationInHours + " hours.");
|
|
||||||
|
|
||||||
if ( !workerReportsDirPath.endsWith("/") )
|
if ( !workerReportsDirPath.endsWith("/") )
|
||||||
workerReportsDirPath += "/";
|
workerReportsDirPath += "/";
|
||||||
|
|
||||||
|
|
|
@ -69,7 +69,7 @@ public class ScheduledTasks {
|
||||||
AssignmentsHandler.shouldNotRequestMore = true;
|
AssignmentsHandler.shouldNotRequestMore = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
logger.warn("The free space is running out (less than " + (requiredFreeSpace / (1024 * 1024)) + " Mb). Will avoid to get new assignments for the next 15 minutes.");
|
logger.warn("The free space is running out (less than " + (requiredFreeSpace / (1024 * 1024)) + " Mb). The Worker will avoid getting new assignments for the next 15 minutes.");
|
||||||
try {
|
try {
|
||||||
Thread.sleep(900_000); // Sleep for 15 mins to stall the scheduler from retrying right away, thus giving time to the disk-space to be freed.
|
Thread.sleep(900_000); // Sleep for 15 mins to stall the scheduler from retrying right away, thus giving time to the disk-space to be freed.
|
||||||
} catch (InterruptedException ie) {
|
} catch (InterruptedException ie) {
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package eu.openaire.urls_worker.components.plugins;
|
package eu.openaire.urls_worker.components.plugins;
|
||||||
|
|
||||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
|
||||||
import eu.openaire.publications_retriever.PublicationsRetriever;
|
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||||||
import eu.openaire.publications_retriever.util.file.FileUtils;
|
import eu.openaire.publications_retriever.util.file.FileUtils;
|
||||||
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
|
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
|
||||||
|
@ -118,8 +117,8 @@ public class PublicationsRetrieverPlugin {
|
||||||
|
|
||||||
String urlToCheck = url;
|
String urlToCheck = url;
|
||||||
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
||||||
if ( !UrlUtils.URL_ACCEPTED_CHARS_TO_AVOID_CANONICALIZATION.matcher(sourceUrl).matches() && ((urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null) ) {
|
if ( (urlToCheck = LoaderAndChecker.basicURLNormalizer.filter(sourceUrl)) == null ) {
|
||||||
logger.warn("Could not canonicalize url: " + sourceUrl);
|
logger.warn("Could not normalize url: " + sourceUrl);
|
||||||
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
|
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
|
||||||
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -125,8 +125,8 @@ public class GeneralController {
|
||||||
logger.error(initMsg + "The \"HttpServletRequest\" is null!");
|
logger.error(initMsg + "The \"HttpServletRequest\" is null!");
|
||||||
return ResponseEntity.internalServerError().build();
|
return ResponseEntity.internalServerError().build();
|
||||||
}
|
}
|
||||||
String remoteAddr = request.getHeader("X-FORWARDED-FOR");
|
String remoteAddr = request.getHeader("X-FORWARDED-FOR"); // This retrieves the original IP address, if the request passes through a proxy server.
|
||||||
if ( remoteAddr == null || remoteAddr.isEmpty() )
|
if ( remoteAddr == null )
|
||||||
remoteAddr = request.getRemoteAddr();
|
remoteAddr = request.getRemoteAddr();
|
||||||
|
|
||||||
if ( ! (remoteAddr.equals("127.0.0.1") || remoteAddr.equals(UriBuilder.ip) || remoteAddr.equals(controllerIp)) ) {
|
if ( ! (remoteAddr.equals("127.0.0.1") || remoteAddr.equals(UriBuilder.ip) || remoteAddr.equals(controllerIp)) ) {
|
||||||
|
|
Loading…
Reference in New Issue