From a89abe3f2f68a76e48d4fe52e0ab614eded0a0a5 Mon Sep 17 00:00:00 2001
From: LSmyrnaios <lsmyrnaios@gmail.com>
Date: Thu, 29 Jun 2023 12:32:06 +0300
Subject: [PATCH] Prioritize the publications that are listed in the
 "publication_boost" table, according to their "boost-level".

---
 .../configuration/ImpalaConnector.java               |  5 ++---
 .../urls_controller/services/UrlsServiceImpl.java    | 12 +++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java
index 3c13be0..47837fe 100644
--- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java
+++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java
@@ -67,9 +67,8 @@ public class ImpalaConnector {
             jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication_urls stored as parquet as select * from " + initialDatabaseName + ".publication_urls");
             jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".publication_urls");
 
-
-            // TODO - Should we add the "publication_oids" table? Will it be used in the "getAssignments" query?
-
+            jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication_boost stored as parquet as select * from " + initialDatabaseName + ".publication_boost");   // Copy the "publication_boost" table of the initial database into the test database, unless it already exists there.
+            jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".publication_boost");   // Gather the statistics of the copied table.
 
             jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".datasource stored as parquet as select * from " + initialDatabaseName + ".datasource");
             jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".datasource");
diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
index 0f05a73..be0bdbb 100644
--- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
@@ -111,31 +111,33 @@ public class UrlsServiceImpl implements UrlsService {
         String findAssignmentsQuery =
             "select pubid, url, datasourceid, datasourcename\n" +   // The datsourceName is currently not used. It may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.
             "from (select distinct pubid, url, datasourceid, datasourcename, attempt_count, pub_year\n" +
-            "   from (select  p.id as pubid, p.year as pub_year, pu.url as url, d.id as datasourceid, d.name as datasourcename, attempts.counts as attempt_count\n" +
+            "   from (select  p.id as pubid, p.year as pub_year, pu.url as url, pb.level as level, d.id as datasourceid, d.name as datasourcename, attempts.counts as attempt_count\n" +
             "       from " + ImpalaConnector.databaseName + ".publication p\n" +
             "       join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
             "       join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
+            "       left outer join " + ImpalaConnector.databaseName + ".publication_boost pb\n" +   // Left-join: the publications without a matching "publication_boost" row are kept, with a null "level".
+            "               on p.id=pb.id\n" +
             "       left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
-            "              on attempts.id=p.id\n" +
+            "               on attempts.id=p.id\n" +
             "       left outer join (\n" +
             "           select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
             "           union all\n" +
             "           select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
             "           ) as existing\n" +
-            "              on existing.id=p.id and existing.original_url=pu.url\n" +
+            "               on existing.id=p.id and existing.original_url=pu.url\n" +
             "           where d.allow_harvest=true and existing.id is null\n" + // For records not found on existing, the "existing.id" will be null.
                     ((excludedDatasourceIDsStringList != null) ?    // If we have an exclusion-list, use it below.
            ("               and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
             "               and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
             "               and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
             "               and pu.url != '' and pu.url is not null\n" +   // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
+            "           order by coalesce(level, -1000) desc\n" +   // Highest boost-level first; a null "level" is treated as -1000. This ordering only decides which rows pass the "limit" below, since the outer query neither selects nor orders by "level".
             "           limit " + (assignmentsLimit * 10) + ")\n" +
             "   as non_distinct_results\n" +
-            "   order by coalesce(attempt_count, 0), coalesce(pub_year, 0) desc, reverse(pubid), url\n" +
+            "   order by coalesce(attempt_count, 0), coalesce(pub_year, 0) desc, reverse(pubid), url\n" +  // We also order by "id" and "url", in order to get exactly the same records for consecutive runs, all things being equal. NOTE(review): the inner "order by" has no tie-breaker before its "limit", so the candidate rows themselves may differ between runs - confirm whether that matters.
             "   limit " + assignmentsLimit + ")\n" +
             "as findAssignmentsQuery";
 
-
         // The "order by" in the end makes sure the older attempted records will be re-attempted after a long time.
         //logger.trace("findAssignmentsQuery:\n" + findAssignmentsQuery); // DEBUG!