From 003c0bf179982500537f386190ef2d2482a17a6e Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Tue, 21 Mar 2023 07:19:35 +0200
Subject: [PATCH] - Add support for excluding specific datasources from being
 crawled. These datasources may be aggregated through bulk-imports by other
 pieces of software. One such datasource is "arXiv.org".
- Fix an issue where the "datasource-type" was retrieved instead of the
  "datasource-name".
- Polish the "findAssignmentsQuery".
---
 .../services/UrlsServiceImpl.java  | 72 +++++++++++++------
 src/main/resources/application.yml |  7 ++
 2 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
index 0cc7468..029b979 100644
--- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
@@ -52,39 +52,65 @@ public class UrlsServiceImpl implements UrlsService {
 
     private final AtomicInteger maxAttemptsPerRecordAtomic;
 
+    private static String excludedDatasourceIDsStringList = null;
+
     public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);
 
 
-    public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord) {
+    public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord,
+                           @Value("${services.pdfaggregation.controller.datasources.excludedIDs}") List<String> excludedIDs) {
         maxAttemptsPerRecordAtomic = new AtomicInteger(maxAttemptsPerRecord);
+
+        // The "excludedIDs" list will not be null, as it is defined inside the "application.yml" file.
+        // In case no IDs of excluded datasources are given, the "excludedIDs" list will just be empty.
+        int exclusionListSize = excludedIDs.size();
+        if ( exclusionListSize == 0 )
+            return; // For this reason, the "excludedDatasourceIDsStringList" code below should be placed last in this constructor.
+
+        // Prepare the "excludedDatasourceIDsStringList" to be used inside the "findAssignmentsQuery". It follows this string-pattern:
+        // ("ID_1","ID_2",...)
+        final StringBuilder sb = new StringBuilder((exclusionListSize * 46) + (exclusionListSize - 1) + 2);
+        sb.append("(");
+        for ( int i = 0; i < exclusionListSize; ++i ) {
+            sb.append("\"").append(excludedIDs.get(i)).append("\"");
+            if ( i < (exclusionListSize - 1) )
+                sb.append(",");
+        }
+        sb.append(")");
+
+        excludedDatasourceIDsStringList = sb.toString();
+        //logger.debug("excludedDatasourceIDsStringList:\n" + excludedDatasourceIDsStringList); // DEBUG!
     }
 
 
     public ResponseEntity<?> getAssignments(String workerId, int assignmentsLimit) {
         // Create the Assignments from the id-urls stored in the database, up to the < assignmentsLimit >.
-
-        String findAssignmentsQuery = "select pubid, url, datasourceid, datasourcetype\n" +
-                "from (select distinct pubid, url, datasourceid, datasourcetype, attempt_count\n" +
-                "from (\n" +
-                "select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" +
-                "from " + ImpalaConnector.databaseName + ".publication p\n" +
-                "join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
-                "join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
-                "left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
-                "on attempts.id=p.id\n" +
-                "left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
-                "union all\n" +
-                "select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
-                "on existing.id=p.id and existing.original_url=pu.url\n" +
-                "where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() +
-                "\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
-                "and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
-                "limit " + (assignmentsLimit * 10) +
-                ")\nas non_distinct_results\n" +
-                "order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
-                "limit " + assignmentsLimit +
-                "\n) as findAssignmentsQuery";
+        String findAssignmentsQuery =
+                "select pubid, url, datasourceid, datasourcename\n" +
+                "from (select distinct pubid, url, datasourceid, datasourcename, attempt_count\n" +
+                "      from (select p.id as pubid, pu.url as url, d.id as datasourceid, d.name as datasourcename, attempts.counts as attempt_count\n" +
+                "            from " + ImpalaConnector.databaseName + ".publication p\n" +
+                "            join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
+                "            join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
+                "            left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
+                "                on attempts.id=p.id\n" +
+                "            left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
+                "                             union all\n" +
+                "                             select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
+                "                on existing.id=p.id and existing.original_url=pu.url\n" +
+                "            where d.allow_harvest=true and existing.id is null\n" +
+                ((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below.
+                        ("            and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
+                "            and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
+                "            and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
+                "            and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
+                "            limit " + (assignmentsLimit * 10) + ")\n" +
+                "      as non_distinct_results\n" +
+                "      order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
+                "      limit " + assignmentsLimit + ")\n" +
+                "as findAssignmentsQuery"; // The "order by" near the end makes sure that older attempted records will be re-attempted after a long time.
 
         //logger.debug("findAssignmentsQuery:\n" + findAssignmentsQuery); // DEBUG!
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index bd9d1ab..67a7032 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -29,6 +29,13 @@ services:
       shouldEmptyBucket: false
       shouldShowAllS3Buckets: true
 
+      datasources: # Provide a list of datasource IDs which should be excluded from crawling. Their content is either bulk-imported or is known to be restricted.
+        excludedIDs: > # Use comma-separated values (one per line, for best readability), as Spring is currently incapable of parsing Dropwizard-styled lists.
+          opendoar____::6f4922f45568161a8cdf4ad2299f6d23
+
+        # Since we use a multi-line value for our list, we add the ID explanations here (otherwise, the comments would become part of the value):
+        # First ID: arXiv.org e-Print Archive
+
 spring:
   datasource:
     driver-class-name: com.cloudera.impala.jdbc41.Driver
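
Review note (illustration, not part of the patch): the constructor's StringBuilder loop can be sanity-checked outside the service with a small standalone class. The class and method names below are made up for this example; only the loop logic mirrors the patch.

    import java.util.List;

    public class ExclusionListSketch {

        // Mirrors the patch's loop: builds the ("ID_1","ID_2",...) pattern
        // (double-quoted IDs, comma-separated, without spaces).
        static String buildExcludedIDsString(List<String> excludedIDs) {
            if ( excludedIDs.isEmpty() )
                return null; // The patch likewise leaves "excludedDatasourceIDsStringList" null, so the query skips the filter.
            StringBuilder sb = new StringBuilder();
            sb.append("(");
            for ( int i = 0; i < excludedIDs.size(); ++i ) {
                sb.append("\"").append(excludedIDs.get(i)).append("\"");
                if ( i < (excludedIDs.size() - 1) )
                    sb.append(",");
            }
            return sb.append(")").toString();
        }

        public static void main(String[] args) {
            String excludedList = buildExcludedIDsString(List.of("opendoar____::6f4922f45568161a8cdf4ad2299f6d23"));
            System.out.println(excludedList);
            // Prints: ("opendoar____::6f4922f45568161a8cdf4ad2299f6d23")

            // Null-guarded splice, as done in "findAssignmentsQuery":
            if ( excludedList != null )
                System.out.println(" and d.id not in " + excludedList);
        }
    }

Impala, like Hive, accepts double-quoted string literals, so the double-quoted IDs are valid inside the "not in (...)" list.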
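Review note (illustration, not part of the patch): if more datasources need to be excluded later, the "application.yml" comment implies one comma-terminated ID per line inside the folded scalar (">"). A hypothetical two-entry configuration (the second ID is a placeholder) would look like the following; YAML folds the lines into a single comma-separated string, which Spring's default conversion then splits into the constructor's List<String> parameter (trimming whitespace around each entry, as far as I know).

    datasources:
      excludedIDs: >
        opendoar____::6f4922f45568161a8cdf4ad2299f6d23,
        someother___::0123456789abcdef0123456789abcdef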