- Make use of the new Normalizer utilized by the PublicationsRetriever plugin.
- Code polishing.
This commit is contained in:
parent
2aedae2367
commit
9c897b8bf4
|
@ -86,9 +86,6 @@ public class AssignmentsHandler {
|
|||
assignmentsForPlugins = HashMultimap.create(expectedDatasourcesPerRequest, expectedAssignmentsPerDatasource);
|
||||
requestUrl = this.controllerBaseUrl + (askForTest ? "test/" : "") + "urls?workerId=" + this.workerId + "&workerAssignmentsLimit=" + this.maxAssignmentsLimitPerBatch;
|
||||
|
||||
long durationInHours = (long) Math.ceil((double) this.maxAssignmentsLimitPerBatch / 1000); // For example, for 10_000 assignments we wait at most 10 hours.
|
||||
logger.debug("Setting the max-connection duration for the \"post-worker-report\" to " + durationInHours + " hours.");
|
||||
|
||||
if ( !workerReportsDirPath.endsWith("/") )
|
||||
workerReportsDirPath += "/";
|
||||
|
||||
|
|
|
@ -69,7 +69,7 @@ public class ScheduledTasks {
|
|||
AssignmentsHandler.shouldNotRequestMore = true;
|
||||
return;
|
||||
}
|
||||
logger.warn("The free space is running out (less than " + (requiredFreeSpace / (1024 * 1024)) + " Mb). Will avoid to get new assignments for the next 15 minutes.");
|
||||
logger.warn("The free space is running out (less than " + (requiredFreeSpace / (1024 * 1024)) + " Mb). The Worker will avoid getting new assignments for the next 15 minutes.");
|
||||
try {
|
||||
Thread.sleep(900_000); // Sleep for 15 mins to stall the scheduler from retrying right away, thus giving time to the disk-space to be freed.
|
||||
} catch (InterruptedException ie) {
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package eu.openaire.urls_worker.components.plugins;
|
||||
|
||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
||||
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||||
import eu.openaire.publications_retriever.util.file.FileUtils;
|
||||
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
|
||||
|
@ -118,8 +117,8 @@ public class PublicationsRetrieverPlugin {
|
|||
|
||||
String urlToCheck = url;
|
||||
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
||||
if ( !UrlUtils.URL_ACCEPTED_CHARS_TO_AVOID_CANONICALIZATION.matcher(sourceUrl).matches() && ((urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null) ) {
|
||||
logger.warn("Could not canonicalize url: " + sourceUrl);
|
||||
if ( (urlToCheck = LoaderAndChecker.basicURLNormalizer.filter(sourceUrl)) == null ) {
|
||||
logger.warn("Could not normalize url: " + sourceUrl);
|
||||
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
|
||||
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
||||
return false;
|
||||
|
|
|
@ -125,8 +125,8 @@ public class GeneralController {
|
|||
logger.error(initMsg + "The \"HttpServletRequest\" is null!");
|
||||
return ResponseEntity.internalServerError().build();
|
||||
}
|
||||
String remoteAddr = request.getHeader("X-FORWARDED-FOR");
|
||||
if ( remoteAddr == null || remoteAddr.isEmpty() )
|
||||
String remoteAddr = request.getHeader("X-FORWARDED-FOR"); // This retrieves the original IP address, if the request passes through a proxy server.
|
||||
if ( remoteAddr == null )
|
||||
remoteAddr = request.getRemoteAddr();
|
||||
|
||||
if ( ! (remoteAddr.equals("127.0.0.1") || remoteAddr.equals(UriBuilder.ip) || remoteAddr.equals(controllerIp)) ) {
|
||||
|
|
Loading…
Reference in New Issue