forked from lsmyrnaios/UrlsController
Upgrade the "findAssignmentsQuery":
- Retrieve the assignments by checking only the publication-urls against the "attempt", "assignment" and "payload" tables, not the IDs. This change allows us to: a) avoid re-attempting urls which have already been attempted multiple times (by different id-url pairs), b) avoid aggregating urls which are already inside the "payload" or "assignment" tables, even when they are related to other IDs. In the end, we only care about the urls when choosing which records should be aggregated.
- Improve performance by using the "anti join" operator, where it fits, in order to allow the engine to use the faster "hash" operations.
This commit is contained in:
parent
c9626de120
commit
b702cf4484
|
@ -118,15 +118,15 @@ public class UrlsServiceImpl implements UrlsService {
|
||||||
" join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + // This is needed for the "d.allow_harvest=true" check later on.
|
" join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + // This is needed for the "d.allow_harvest=true" check later on.
|
||||||
" left outer join " + DatabaseConnector.databaseName + ".publication_boost pb\n" +
|
" left outer join " + DatabaseConnector.databaseName + ".publication_boost pb\n" +
|
||||||
" on p.id=pb.id\n" +
|
" on p.id=pb.id\n" +
|
||||||
" left outer join (select count(a.id) as counts, a.id from " + DatabaseConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
|
" left outer join (select count(at.original_url) as counts, at.original_url from " + DatabaseConnector.databaseName + ".attempt at group by at.original_url) as attempts\n" +
|
||||||
" on attempts.id=p.id\n" +
|
" on attempts.original_url=pu.url\n" +
|
||||||
" left outer join (\n" +
|
" left anti join (\n" +
|
||||||
" select a.id, a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" +
|
" select a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" +
|
||||||
" union all\n" +
|
" union all\n" +
|
||||||
" select pl.id, pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
|
" select pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
|
||||||
" ) as existing\n" +
|
" ) as existing\n" +
|
||||||
" on existing.id=p.id and existing.original_url=pu.url\n" +
|
" on existing.original_url=pu.url\n" +
|
||||||
" where d.allow_harvest=true and existing.id is null\n" + // For records not found on existing, the "existing.id" will be null.
|
" where d.allow_harvest=true\n" +
|
||||||
((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below.
|
((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below.
|
||||||
(" and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
|
(" and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
|
||||||
" and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
|
" and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
|
||||||
|
|
Loading…
Reference in New Issue