From 3dd349dd0054d8e9570e50b73b5c6b483a729ec0 Mon Sep 17 00:00:00 2001
From: LSmyrnaios <lsmyrnaios@gmail.com>
Date: Wed, 13 Sep 2023 14:38:15 +0300
Subject: [PATCH] Improve the "findAssignmentsQuery": - Fix an issue, where
 assignments, having an above-zero attempt_count, were finding their way to
 the results, just because they were prioritized based on their boost_level or
 pub_year. Besides the old failed assignments being retried sooner, the
 not-yet-processed boosted-publications were pushed out to the workers much
 more slowly. - Simplify the query, by removing the internal "ordering" and
 "limit", which had performance benefits when we did not need additional
 ordering for "level" and "pub_year". Back then, we wanted to apply the final
 orderings to as few rows as possible.

---
 .../services/UrlsServiceImpl.java             | 52 +++++++++----------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
index a81ee04..a49f906 100644
--- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
@@ -111,34 +111,30 @@ public class UrlsServiceImpl implements UrlsService {
         // Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.
         String findAssignmentsQuery =
             "select pubid, url, datasourceid, datasourcename\n" +   // Select the final sorted data with "assignmentsLimit".
-            "from (select distinct pubid, url, datasourceid, datasourcename, level, pub_year, attempt_count\n" +  // Select the distinct id-url data. Beware that this will return duplicate id-url paris, wince one pair may be associated with multiple datasources.
-            "   from (select p.id as pubid, pu.url as url, pb.level as level, attempts.counts as attempt_count, p.year as pub_year, d.id as datasourceid, d.name as datasourcename\n" +  // Select all needed columns frm JOINs, order by "boost.level" and limit them to  (assignmentsLimit * 10)
-            "       from " + DatabaseConnector.databaseName + ".publication p\n" +
-            "           join " + DatabaseConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
-            "           join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +  // This is needed for the "d.allow_harvest=true" check later on.
-            "           left outer join " + DatabaseConnector.databaseName + ".publication_boost pb\n" +
-            "                   on p.id=pb.id\n" +
-            "           left outer join (select count(a.id) as counts, a.id from " + DatabaseConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
-            "                   on attempts.id=p.id\n" +
-            "           left outer join (\n" +
-            "               select a.id, a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" +
-            "               union all\n" +
-            "               select pl.id, pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
-            "               ) as existing\n" +
-            "                   on existing.id=p.id and existing.original_url=pu.url\n" +
-            "       where d.allow_harvest=true and existing.id is null\n" + // For records not found on existing, the "existing.id" will be null.
-                        ((excludedDatasourceIDsStringList != null) ?    // If we have an exclusion-list, use it below.
-           ("           and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
-            "           and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
-            "           and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
-            "           and pu.url != '' and pu.url is not null\n" +   // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
-            "           and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" +  // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
-            "       order by coalesce(level, 0) desc, coalesce(pub_year, 0) desc\n" +
-            "       limit " + (assignmentsLimit * 10) + "\n" +
-            "   ) as non_distinct_results\n" +
-            "   order by coalesce(level, 0) desc, coalesce(pub_year, 0) desc, coalesce(attempt_count, 0), reverse(pubid), url\n" +  // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
-            "   limit " + assignmentsLimit + "\n" +
-            ") as findAssignmentsQuery";
+            "from (select distinct p.id as pubid, pu.url as url, pb.level as level, attempts.counts as attempt_count, p.year as pub_year, d.id as datasourceid, d.name as datasourcename\n" +   // Select the distinct id-url data. Beware that this will return duplicate id-url pairs, since one pair may be associated with multiple datasources.
+            "   from " + DatabaseConnector.databaseName + ".publication p\n" +
+            "       join " + DatabaseConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
+            "       join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +  // This is needed for the "d.allow_harvest=true" check later on.
+            "       left outer join " + DatabaseConnector.databaseName + ".publication_boost pb\n" +
+            "           on p.id=pb.id\n" +
+            "       left outer join (select count(a.id) as counts, a.id from " + DatabaseConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
+            "           on attempts.id=p.id\n" +
+            "       left outer join (\n" +
+            "           select a.id, a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" +
+            "           union all\n" +
+            "           select pl.id, pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
+            "           ) as existing\n" +
+            "               on existing.id=p.id and existing.original_url=pu.url\n" +
+            "   where d.allow_harvest=true and existing.id is null\n" + // For records not found on existing, the "existing.id" will be null.
+                    ((excludedDatasourceIDsStringList != null) ?    // If we have an exclusion-list, use it below.
+           ("       and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
+            "       and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
+            "       and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
+            "       and pu.url != '' and pu.url is not null\n" +   // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
+            "       and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" +  // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
+            ") as distinct_results\n" +
+            "order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, reverse(pubid), url\n" +  // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
+            "limit " + assignmentsLimit;
 
         // The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.
         // However, the "datasourceid" is useful to be able to generate the fileNames for the S3, without needing to perform additional select queries (with JOINs) at that phase.