diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java index 4baa555..8412ebe 100644 --- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java +++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java @@ -66,24 +66,24 @@ public class UrlsServiceImpl implements UrlsService { String findAssignmentsQuery = "select pubid, url, datasourceid, datasourcetype\n" + "from (select distinct pubid, url, datasourceid, datasourcetype, attempt_count\n" + - "from (\n" + - "select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" + - "from " + ImpalaConnector.databaseName + ".publication p\n" + - "join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" + - "join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + - "left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" + - "on attempts.id=p.id\n" + - "left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" + - "union all\n" + - "select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" + - "on existing.id=p.id and existing.original_url=pu.url\n" + - "where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + - "\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" + - "and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing. - "limit " + (assignmentsLimit * 10) + - ")\nas non_distinct_results\n" + - "order by coalesce(attempt_count, 0), reverse(pubid), url\n" + - "limit " + assignmentsLimit + + "from (\n" + + "select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" + + "from " + ImpalaConnector.databaseName + ".publication p\n" + + "join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" + + "join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + + "left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" + + "on attempts.id=p.id\n" + + "left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" + + "union all\n" + + "select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" + + "on existing.id=p.id and existing.original_url=pu.url\n" + + "where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + + "\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" + + "and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing. + "limit " + (assignmentsLimit * 10) + + ")\nas non_distinct_results\n" + + "order by coalesce(attempt_count, 0), reverse(pubid), url\n" + + "limit " + assignmentsLimit + "\n) as findAssignmentsQuery"; // The "order by" in the end makes sure the older attempted records will be re-attempted after a long time. diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index d4ecaed..b254edd 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -180,12 +180,12 @@ public class FileUtils { // Extract the "fileNameWithExtension" to be added in the HashMultimap. Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation); if ( ! matcher.matches() ) { - logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\", using this regex: " + FILENAME_ID_EXTENSION); + logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION); return null; } String fileNameWithExtension = matcher.group(1); if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) { - logger.error("Failed to extract the \"fileNameWithExtension\" from \"" + fileLocation + "\"."); + logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION); return null; }