diff --git a/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java index 864cffd..e8f6dbd 100644 --- a/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java @@ -119,7 +119,7 @@ public class PublicationsRetrieverPlugin { String urlToCheck = url; String sourceUrl = urlToCheck; // Hold it here for the logging-messages. - if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) { + if ( !UrlUtils.URL_ACCEPTED_CHARS_TO_AVOID_CANONICALIZATION.matcher(sourceUrl).matches() && ((urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null) ) { logger.warn("Could not canonicalize url: " + sourceUrl); UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null); LoaderAndChecker.connProblematicUrls.incrementAndGet();