- Make use of the new Normalizer utilized by the PublicationsRetriever plugin.

- Code polishing.
This commit is contained in:
Lampros Smyrnaios 2023-06-10 02:40:45 +03:00
parent 2aedae2367
commit 9c897b8bf4
4 changed files with 5 additions and 9 deletions

View File

@ -86,9 +86,6 @@ public class AssignmentsHandler {
assignmentsForPlugins = HashMultimap.create(expectedDatasourcesPerRequest, expectedAssignmentsPerDatasource);
requestUrl = this.controllerBaseUrl + (askForTest ? "test/" : "") + "urls?workerId=" + this.workerId + "&workerAssignmentsLimit=" + this.maxAssignmentsLimitPerBatch;
long durationInHours = (long) Math.ceil((double) this.maxAssignmentsLimitPerBatch / 1000); // For example, for 10_000 assignments we wait at most 10 hours.
logger.debug("Setting the max-connection duration for the \"post-worker-report\" to " + durationInHours + " hours.");
if ( !workerReportsDirPath.endsWith("/") )
workerReportsDirPath += "/";

View File

@ -69,7 +69,7 @@ public class ScheduledTasks {
AssignmentsHandler.shouldNotRequestMore = true;
return;
}
logger.warn("The free space is running out (less than " + (requiredFreeSpace / (1024 * 1024)) + " Mb). Will avoid to get new assignments for the next 15 minutes.");
logger.warn("The free space is running out (less than " + (requiredFreeSpace / (1024 * 1024)) + " Mb). The Worker will avoid getting new assignments for the next 15 minutes.");
try {
Thread.sleep(900_000); // Sleep for 15 mins to stall the scheduler from retrying right away, thus giving time to the disk-space to be freed.
} catch (InterruptedException ie) {

View File

@ -1,6 +1,5 @@
package eu.openaire.urls_worker.components.plugins;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
@ -118,8 +117,8 @@ public class PublicationsRetrieverPlugin {
String urlToCheck = url;
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( !UrlUtils.URL_ACCEPTED_CHARS_TO_AVOID_CANONICALIZATION.matcher(sourceUrl).matches() && ((urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null) ) {
logger.warn("Could not canonicalize url: " + sourceUrl);
if ( (urlToCheck = LoaderAndChecker.basicURLNormalizer.filter(sourceUrl)) == null ) {
logger.warn("Could not normalize url: " + sourceUrl);
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;

View File

@ -125,8 +125,8 @@ public class GeneralController {
logger.error(initMsg + "The \"HttpServletRequest\" is null!");
return ResponseEntity.internalServerError().build();
}
String remoteAddr = request.getHeader("X-FORWARDED-FOR");
if ( remoteAddr == null || remoteAddr.isEmpty() )
String remoteAddr = request.getHeader("X-FORWARDED-FOR"); // This retrieves the original IP address, if the request passes through a proxy server.
if ( remoteAddr == null )
remoteAddr = request.getRemoteAddr();
if ( ! (remoteAddr.equals("127.0.0.1") || remoteAddr.equals(UriBuilder.ip) || remoteAddr.equals(controllerIp)) ) {