From 003c0bf179982500537f386190ef2d2482a17a6e Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Tue, 21 Mar 2023 07:19:35 +0200
Subject: [PATCH] - Add support for excluding specific datasources from being
 crawled. These datasources may be aggregated through bulk-imports by other
 pieces of software. One such datasource is "arXiv.org".
- Fix an issue where the "datasource-type" was retrieved instead of the
  "datasource-name".
- Polish the "findAssignmentsQuery".
---
 .../services/UrlsServiceImpl.java  | 72 +++++++++++++------
 src/main/resources/application.yml |  7 ++
 2 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
index 0cc7468..029b979 100644
--- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
@@ -52,39 +52,65 @@ public class UrlsServiceImpl implements UrlsService {
 
     private final AtomicInteger maxAttemptsPerRecordAtomic;
 
+    private static String excludedDatasourceIDsStringList = null;
+
     public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);
 
 
-    public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord) {
+    public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord,
+                           @Value("${services.pdfaggregation.controller.datasources.excludedIDs}") List<String> excludedIDs) {
         maxAttemptsPerRecordAtomic = new AtomicInteger(maxAttemptsPerRecord);
+
+        // The "excludedIDs" list will not be null, as it is defined inside the "application.yml" file.
+        // In case no IDs of excluded datasources are given, the "excludedIDs" list will just be empty.
+        int exclusionListSize = excludedIDs.size();
+        if ( exclusionListSize == 0 )
+            return; // For this reason, the "excludedDatasourceIDsStringList" code below should be placed last in this constructor.
+
+        // Prepare the "excludedDatasourceIDsStringList" to be used inside the "findAssignmentsQuery". It follows this string-pattern:
+        // ("ID_1","ID_2",...)
+        final StringBuilder sb = new StringBuilder((exclusionListSize * 46) + (exclusionListSize - 1) + 2);
+        sb.append("(");
+        for ( int i = 0; i < exclusionListSize; ++i ) {
+            sb.append("\"").append(excludedIDs.get(i)).append("\"");
+            if ( i < (exclusionListSize - 1) )
+                sb.append(",");
+        }
+        sb.append(")");
+
+        excludedDatasourceIDsStringList = sb.toString();
+        //logger.debug("excludedDatasourceIDsStringList:\n" + excludedDatasourceIDsStringList); // DEBUG!
     }
 
 
     public ResponseEntity<?> getAssignments(String workerId, int assignmentsLimit) {
         // Create the Assignments from the id-urls stored in the database, up to the < assignmentsLimit >.
-
-        String findAssignmentsQuery = "select pubid, url, datasourceid, datasourcetype\n" +
-                "from (select distinct pubid, url, datasourceid, datasourcetype, attempt_count\n" +
-                "from (\n" +
-                "select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" +
-                "from " + ImpalaConnector.databaseName + ".publication p\n" +
-                "join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
-                "join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
-                "left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
-                "on attempts.id=p.id\n" +
-                "left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
-                "union all\n" +
-                "select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
-                "on existing.id=p.id and existing.original_url=pu.url\n" +
-                "where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() +
-                "\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
-                "and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
-                "limit " + (assignmentsLimit * 10) +
-                ")\nas non_distinct_results\n" +
-                "order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
-                "limit " + assignmentsLimit +
-                "\n) as findAssignmentsQuery";
+        String findAssignmentsQuery =
+                "select pubid, url, datasourceid, datasourcename\n" +
+                "from (select distinct pubid, url, datasourceid, datasourcename, attempt_count\n" +
+                "      from (select p.id as pubid, pu.url as url, d.id as datasourceid, d.name as datasourcename, attempts.counts as attempt_count\n" +
+                "            from " + ImpalaConnector.databaseName + ".publication p\n" +
+                "            join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
+                "            join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
+                "            left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
+                "                on attempts.id=p.id\n" +
+                "            left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
+                "                             union all\n" +
+                "                             select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
+                "                on existing.id=p.id and existing.original_url=pu.url\n" +
+                "            where d.allow_harvest=true and existing.id is null\n" +
+                ((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below.
+                        ("            and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
+                "            and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
+                "            and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
+                "            and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
+                "            limit " + (assignmentsLimit * 10) + ")\n" +
+                "      as non_distinct_results\n" +
+                "      order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
+                "      limit " + assignmentsLimit + ")\n" +
+                "as findAssignmentsQuery"; // The "order by" near the end makes sure that older attempted records will be re-attempted after a long time.
 
         //logger.debug("findAssignmentsQuery:\n" + findAssignmentsQuery); // DEBUG!
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index bd9d1ab..67a7032 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -29,6 +29,13 @@ services:
       shouldEmptyBucket: false
       shouldShowAllS3Buckets: true
 
+      datasources: # Provide a list of datasource IDs which should be excluded from crawling. Their content is either bulk-imported or is known to be restricted.
+        excludedIDs: > # Use comma-separated values (one per line, for best readability), as Spring is currently incapable of parsing Dropwizard-styled lists.
+          opendoar____::6f4922f45568161a8cdf4ad2299f6d23
+
+        # Since we use a multi-line value for our list, we add the ID explanations here (otherwise, the comments would become part of the value):
+        # First ID: arXiv.org e-Print Archive
+
 spring:
   datasource:
     driver-class-name: com.cloudera.impala.jdbc41.Driver
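
Review note (illustration, not part of the patch): the constructor's StringBuilder loop can be sanity-checked outside the service with a small standalone class. The class and method names below are made up for this example; only the loop logic mirrors the patch.

    import java.util.List;

    public class ExclusionListSketch {

        // Mirrors the patch's loop: builds the ("ID_1","ID_2",...) pattern
        // (double-quoted IDs, comma-separated, without spaces).
        static String buildExcludedIDsString(List<String> excludedIDs) {
            if ( excludedIDs.isEmpty() )
                return null; // The patch likewise leaves "excludedDatasourceIDsStringList" null, so the query skips the filter.
            StringBuilder sb = new StringBuilder();
            sb.append("(");
            for ( int i = 0; i < excludedIDs.size(); ++i ) {
                sb.append("\"").append(excludedIDs.get(i)).append("\"");
                if ( i < (excludedIDs.size() - 1) )
                    sb.append(",");
            }
            return sb.append(")").toString();
        }

        public static void main(String[] args) {
            String excludedList = buildExcludedIDsString(List.of("opendoar____::6f4922f45568161a8cdf4ad2299f6d23"));
            System.out.println(excludedList);
            // Prints: ("opendoar____::6f4922f45568161a8cdf4ad2299f6d23")

            // Null-guarded splice, as done in "findAssignmentsQuery":
            if ( excludedList != null )
                System.out.println(" and d.id not in " + excludedList);
        }
    }

Impala, like Hive, accepts double-quoted string literals, so the double-quoted IDs are valid inside the "not in (...)" list.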
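Review note (illustration, not part of the patch): if more datasources need to be excluded later, the "application.yml" comment implies one comma-terminated ID per line inside the folded scalar (">"). A hypothetical two-entry configuration (the second ID is a placeholder) would look like the following; YAML folds the lines into a single comma-separated string, which Spring's default conversion then splits into the constructor's List<String> parameter (trimming whitespace around each entry, as far as I know).

    datasources:
      excludedIDs: >
        opendoar____::6f4922f45568161a8cdf4ad2299f6d23,
        someother___::0123456789abcdef0123456789abcdef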