Improve the "findAssignmentsQuery":
- Fix an issue where assignments with an above-zero attempt_count found their way into the results just because they were prioritized by their boost_level or pub_year. Besides causing old failed assignments to be retried sooner, this meant that the not-yet-processed boosted publications were pushed out to the workers much more slowly.
- Simplify the query by removing the internal "ordering" and "limit", which had performance benefits back when we did not need additional ordering by "level" and "pub_year". Back then, we wanted to apply the final orderings to as few rows as possible.
This commit is contained in:
parent
ee2df19ce1
commit
3dd349dd00
|
@ -111,8 +111,7 @@ public class UrlsServiceImpl implements UrlsService {
|
||||||
// Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.
|
// Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.
|
||||||
String findAssignmentsQuery =
|
String findAssignmentsQuery =
|
||||||
"select pubid, url, datasourceid, datasourcename\n" + // Select the final sorted data with "assignmentsLimit".
|
"select pubid, url, datasourceid, datasourcename\n" + // Select the final sorted data with "assignmentsLimit".
|
||||||
"from (select distinct pubid, url, datasourceid, datasourcename, level, pub_year, attempt_count\n" + // Select the distinct id-url data. Beware that this will return duplicate id-url paris, wince one pair may be associated with multiple datasources.
|
"from (select distinct p.id as pubid, pu.url as url, pb.level as level, attempts.counts as attempt_count, p.year as pub_year, d.id as datasourceid, d.name as datasourcename\n" + // Select the distinct id-url data. Beware that this will return duplicate id-url pairs, wince one pair may be associated with multiple datasources.
|
||||||
" from (select p.id as pubid, pu.url as url, pb.level as level, attempts.counts as attempt_count, p.year as pub_year, d.id as datasourceid, d.name as datasourcename\n" + // Select all needed columns frm JOINs, order by "boost.level" and limit them to (assignmentsLimit * 10)
|
|
||||||
" from " + DatabaseConnector.databaseName + ".publication p\n" +
|
" from " + DatabaseConnector.databaseName + ".publication p\n" +
|
||||||
" join " + DatabaseConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
|
" join " + DatabaseConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
|
||||||
" join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + // This is needed for the "d.allow_harvest=true" check later on.
|
" join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + // This is needed for the "d.allow_harvest=true" check later on.
|
||||||
|
@ -133,12 +132,9 @@ public class UrlsServiceImpl implements UrlsService {
|
||||||
" and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
|
" and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
|
||||||
" and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
|
" and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
|
||||||
" and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" + // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
|
" and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" + // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
|
||||||
" order by coalesce(level, 0) desc, coalesce(pub_year, 0) desc\n" +
|
") as distinct_results\n" +
|
||||||
" limit " + (assignmentsLimit * 10) + "\n" +
|
"order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, reverse(pubid), url\n" + // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
|
||||||
" ) as non_distinct_results\n" +
|
"limit " + assignmentsLimit;
|
||||||
" order by coalesce(level, 0) desc, coalesce(pub_year, 0) desc, coalesce(attempt_count, 0), reverse(pubid), url\n" + // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
|
|
||||||
" limit " + assignmentsLimit + "\n" +
|
|
||||||
") as findAssignmentsQuery";
|
|
||||||
|
|
||||||
// The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.
|
// The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.
|
||||||
// However, the "datasourceid" is useful to be able to generate the fileNames for the S3, without needing to perform additional select queries (with JOINs) at that phase.
|
// However, the "datasourceid" is useful to be able to generate the fileNames for the S3, without needing to perform additional select queries (with JOINs) at that phase.
|
||||||
|
|
Loading…
Reference in New Issue