From b702cf44842f1a9abb46b0282e0356a518c7348b Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 4 Oct 2023 13:43:15 +0300 Subject: [PATCH] Upgrade the "findAssignmentsQuery": - Retrieve the assignments by checking only the publication-urls against the "attempt", "assignment" and "payload" tables, not the IDs. This change allows us to: a) avoid re-attempting urls which have already been attempted multiple times (by different id-url pairs), b) avoid aggregating urls which are already inside the "payload" or "assignment" tables, even when they are related to other IDs. In the end, we only care about the urls when choosing which records should be aggregated. - Improve performance by using the "anti join" operator, where it fits, in order to allow the engine to use the faster "hash" operations. --- .../services/UrlsServiceImpl.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java index 3a4edce..2120ee2 100644 --- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java +++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java @@ -118,15 +118,15 @@ public class UrlsServiceImpl implements UrlsService { " join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + // This is needed for the "d.allow_harvest=true" check later on. 
" left outer join " + DatabaseConnector.databaseName + ".publication_boost pb\n" + " on p.id=pb.id\n" + - " left outer join (select count(a.id) as counts, a.id from " + DatabaseConnector.databaseName + ".attempt a group by a.id) as attempts\n" + - " on attempts.id=p.id\n" + - " left outer join (\n" + - " select a.id, a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" + + " left outer join (select count(at.original_url) as counts, at.original_url from " + DatabaseConnector.databaseName + ".attempt at group by at.original_url) as attempts\n" + + " on attempts.original_url=pu.url\n" + + " left anti join (\n" + + " select a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" + " union all\n" + - " select pl.id, pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables. - " ) as existing\n" + - " on existing.id=p.id and existing.original_url=pu.url\n" + - " where d.allow_harvest=true and existing.id is null\n" + // For records not found on existing, the "existing.id" will be null. + " select pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables. + " ) as existing\n" + + " on existing.original_url=pu.url\n" + + " where d.allow_harvest=true\n" + ((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below. (" and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") + " and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +