Bypass URL canonicalization for URLs that contain certain uncommon characters, since canonicalizing them causes the URLs to be rejected.
This commit is contained in:
parent
a9b1b20a51
commit
7f3ca80959
|
@@ -119,7 +119,7 @@ public class PublicationsRetrieverPlugin {
|
|||
|
||||
String urlToCheck = url;
|
||||
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
||||
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
|
||||
if ( !UrlUtils.URL_ACCEPTED_CHARS_TO_AVOID_CANONICALIZATION.matcher(sourceUrl).matches() && ((urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null) ) {
|
||||
logger.warn("Could not canonicalize url: " + sourceUrl);
|
||||
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
|
||||
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
||||
|
|
Loading…
Reference in New Issue