Upgrade the "findAssignmentsQuery":

- Retrieve the assignments by checking only the publication-urls against the "attempt", "assignment" and "payload" tables, not the IDs. This change allows us to: a) avoid re-attempting urls which have already been attempted multiple times (by different id-url pairs), b) avoid aggregating urls which are already inside the "payload" or "assignment" tables, even when they are related to other IDs.
In the end, we only care about the urls when choosing which records should be aggregated.
- Improve performance by using the "anti join" operator, where it fits, in order to allow the engine to use the faster "hash" operations.
This commit is contained in:
Lampros Smyrnaios 2023-10-04 13:43:15 +03:00
parent c9626de120
commit b702cf4484
1 changed file with 8 additions and 8 deletions

View File

@ -118,15 +118,15 @@ public class UrlsServiceImpl implements UrlsService {
" join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + // This is needed for the "d.allow_harvest=true" check later on.
" left outer join " + DatabaseConnector.databaseName + ".publication_boost pb\n" +
" on p.id=pb.id\n" +
" left outer join (select count(a.id) as counts, a.id from " + DatabaseConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
" on attempts.id=p.id\n" +
" left outer join (\n" +
" select a.id, a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" +
" left outer join (select count(at.original_url) as counts, at.original_url from " + DatabaseConnector.databaseName + ".attempt at group by at.original_url) as attempts\n" +
" on attempts.original_url=pu.url\n" +
" left anti join (\n" +
" select a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" +
" union all\n" +
" select pl.id, pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
" ) as existing\n" +
" on existing.id=p.id and existing.original_url=pu.url\n" +
" where d.allow_harvest=true and existing.id is null\n" + // For records not found on existing, the "existing.id" will be null.
" select pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
" ) as existing\n" +
" on existing.original_url=pu.url\n" +
" where d.allow_harvest=true\n" +
((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below.
(" and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
" and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +