Improve performance and reduce memory usage of the "findAssignmentsQuery":

- Reorder JOINs and predicates to reduce the computational cost.
- Remove the memory-costly "pu.url" predicates from the "where" clause, as the DB no longer contains any empty urls.
2023-10-31 15:59:48 +02:00 · 2023-10-31 15:59:48 +02:00 · 9b1f2c4931
parent db929d8931
commit 9b1f2c4931
1 changed files with 8 additions and 10 deletions
--- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
@@ -115,23 +115,21 @@ public class UrlsServiceImpl implements UrlsService {
            "select pubid, url, datasourceid, datasourcename\n" +   // Select the final sorted data with "assignmentsLimit".
            "from (select distinct p.id as pubid, pu.url as url, pb.level as level, attempts.counts as attempt_count, p.year as pub_year, d.id as datasourceid, d.name as datasourcename\n" +   // Select the distinct id-url data. Beware that this will return duplicate id-url pairs, wince one pair may be associated with multiple datasources.
            "   from " + DatabaseConnector.databaseName + ".publication p\n" +
+            "       join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid and d.allow_harvest=true"+
+                        ((excludedDatasourceIDsStringList != null) ?    // If we have an exclusion-list, use it below.
+                        (" and d.id not in " + excludedDatasourceIDsStringList + GenericUtils.endOfLine) : "") +
            "       join " + DatabaseConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
-            "       join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +  // This is needed for the "d.allow_harvest=true" check later on.
-            "       left outer join " + DatabaseConnector.databaseName + ".publication_boost pb\n" +
-            "           on p.id=pb.id\n" +
-            "       left outer join (select count(at.original_url) as counts, at.original_url from " + DatabaseConnector.databaseName + ".attempt at group by at.original_url) as attempts\n" +
-            "           on attempts.original_url=pu.url\n" +
            "       left anti join (select a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" +
            "           union all\n" +
            "           select pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
            "           ) as existing\n" +
            "           on existing.original_url=pu.url\n" +
-            "   where d.allow_harvest=true\n" +
-                    ((excludedDatasourceIDsStringList != null) ?    // If we have an exclusion-list, use it below.
-           ("       and d.id not in " + excludedDatasourceIDsStringList + GenericUtils.endOfLine) : "") +
-            "       and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + GenericUtils.endOfLine +
+            "       left outer join " + DatabaseConnector.databaseName + ".publication_boost pb\n" +
+            "           on p.id=pb.id\n" +
+            "       left outer join (select count(at.original_url) as counts, at.original_url from " + DatabaseConnector.databaseName + ".attempt at group by at.original_url) as attempts\n" +
+            "           on attempts.original_url=pu.url\n" +
+            "   where coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + GenericUtils.endOfLine +
            "       and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.original_url=pu.url and a.error_class = 'noRetry' limit 1)\n" +
-            "       and pu.url != '' and pu.url is not null\n" +   // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
            "       and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" +  // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
            ") as distinct_results\n" +
            "order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, reverse(pubid), url\n" +  // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.