Improve speed of fulltext-collection by using a ranking system to prioritize Open and Unknown access publications over Restricted, Embargoed and Closed access ones.
This commit is contained in:
parent
63cf63e6cc
commit
e46743bfba
|
@ -119,8 +119,8 @@ public class UrlsServiceImpl implements UrlsService {
|
||||||
|
|
||||||
// Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.
|
// Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.
|
||||||
String findAssignmentsQuery =
|
String findAssignmentsQuery =
|
||||||
"select pubid, url, datasourceid, datasourcename\n" + // Select the final sorted data with "assignmentsLimit".
|
"select pubid, url, datasourceid, datasourcename, accessmode\n" + // Select the final sorted data with "assignmentsLimit".
|
||||||
"from (select distinct p.id as pubid, pu.url as url, pb.level as level, attempts.counts as attempt_count, p.year as pub_year, d.id as datasourceid, d.name as datasourcename\n" + // Select the distinct id-url data. Beware that this will return duplicate id-url pairs, since one pair may be associated with multiple datasources.
|
"from (select distinct p.id as pubid, pu.url as url, pb.level as level, attempts.counts as attempt_count, p.year as pub_year, d.id as datasourceid, d.name as datasourcename, p.accessmode\n" + // Select the distinct id-url data. Beware that this will return duplicate id-url pairs, since one pair may be associated with multiple datasources.
|
||||||
" from " + DatabaseConnector.databaseName + ".publication_urls pu\n" +
|
" from " + DatabaseConnector.databaseName + ".publication_urls pu\n" +
|
||||||
" join " + DatabaseConnector.databaseName + ".publication p on p.id=pu.id\n" +
|
" join " + DatabaseConnector.databaseName + ".publication p on p.id=pu.id\n" +
|
||||||
" join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid and d.allow_harvest=true"+
|
" join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid and d.allow_harvest=true"+
|
||||||
|
@ -139,7 +139,10 @@ public class UrlsServiceImpl implements UrlsService {
|
||||||
" and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.original_url=pu.url and a.error_class = 'noRetry' limit 1)\n" +
|
" and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.original_url=pu.url and a.error_class = 'noRetry' limit 1)\n" +
|
||||||
" and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" + // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since some have invalid year, like "9999").
|
" and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" + // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since some have invalid year, like "9999").
|
||||||
") as distinct_results\n" +
|
") as distinct_results\n" +
|
||||||
"order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, (case when regexp_like(url, '" + DOC_URL_FILTER + "', 'i') then 1 else 0 end) desc, reverse(pubid), url\n" + // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
|
"order by coalesce(attempt_count, 0), coalesce(level, 0) desc,\n" +
|
||||||
|
" (case when accessmode='OPEN' then 5 when accessmode='UNKNOWN' then 4 when accessmode='RESTRICTED' then 3 when accessmode='EMBARGOED' then 2 when accessmode='CLOSED' then 1 else 4 end) desc,\n" +
|
||||||
|
" coalesce(pub_year, 0) desc, (case when regexp_like(url, '" + DOC_URL_FILTER + "', 'i') then 1 else 0 end) desc,\n" +
|
||||||
|
" reverse(pubid), url\n" + // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
|
||||||
"limit " + assignmentsLimit;
|
"limit " + assignmentsLimit;
|
||||||
|
|
||||||
// The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.
|
// The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.
|
||||||
|
@ -179,6 +182,7 @@ public class UrlsServiceImpl implements UrlsService {
|
||||||
assignment.setOriginalUrl(rs.getString(2));
|
assignment.setOriginalUrl(rs.getString(2));
|
||||||
datasource.setId(rs.getString(3));
|
datasource.setId(rs.getString(3));
|
||||||
datasource.setName(rs.getString(4));
|
datasource.setName(rs.getString(4));
|
||||||
|
// The 5th column is the "accessmode" which we do not need after the prioritization takes place in the query.
|
||||||
} catch (SQLException sqle) {
|
} catch (SQLException sqle) {
|
||||||
logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle);
|
logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue