Prioritize the full-text urls over the landing-page ones.
This commit is contained in:
parent
ee1ca8966b
commit
3a70b57146
|
@ -67,6 +67,11 @@ public class UrlsServiceImpl implements UrlsService {
|
|||
|
||||
private static String excludedDatasourceIDsStringList = null;
|
||||
|
||||
|
||||
private static final String DOC_URL_FILTER = ".+(pdf|download|/doc|document|(?:/|[?]|&)file|/fulltext|attachment|/paper|viewfile|viewdoc|/get|cgi/viewcontent.cgi\\?|t[ée]l[ée]charger|descargar).*";
|
||||
// "DOC_URL_FILTER" works for lowerCase Strings (we use the "ignore-case" indicator in the "regexp_like()" method).
|
||||
|
||||
|
||||
public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);
|
||||
// TODO - Unify this ExecutorService with the hash-matching executorService. Since one will ALWAYS be called after the other. So why having two ExecServices to handle?
|
||||
|
||||
|
@ -135,7 +140,7 @@ public class UrlsServiceImpl implements UrlsService {
|
|||
" and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.original_url=pu.url and a.error_class = 'noRetry' limit 1)\n" +
|
||||
" and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" + // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
|
||||
") as distinct_results\n" +
|
||||
"order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, reverse(pubid), url\n" + // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
|
||||
"order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, (case when regexp_like(url, '" + DOC_URL_FILTER + "', 'i') then 1 else 0 end) desc, reverse(pubid), url\n" + // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
|
||||
"limit " + assignmentsLimit;
|
||||
|
||||
// The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.
|
||||
|
|
Loading…
Reference in New Issue