Bypass URL canonicalization for URLs containing certain uncommon characters that would otherwise cause those URLs to be rejected.

This commit is contained in:
Lampros Smyrnaios 2023-05-30 19:45:14 +03:00
parent a9b1b20a51
commit 7f3ca80959
1 changed file with 1 addition and 1 deletion

View File

@@ -119,7 +119,7 @@ public class PublicationsRetrieverPlugin {
String urlToCheck = url;
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
if ( !UrlUtils.URL_ACCEPTED_CHARS_TO_AVOID_CANONICALIZATION.matcher(sourceUrl).matches() && ((urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null) ) {
logger.warn("Could not canonicalize url: " + sourceUrl);
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
LoaderAndChecker.connProblematicUrls.incrementAndGet();