Prioritize the full-text urls over the landing-page ones.

This commit is contained in:
Lampros Smyrnaios 2024-01-15 12:59:50 +02:00
parent ee1ca8966b
commit 3a70b57146
1 changed files with 6 additions and 1 deletions

View File

@ -67,6 +67,11 @@ public class UrlsServiceImpl implements UrlsService {
private static String excludedDatasourceIDsStringList = null;
private static final String DOC_URL_FILTER = ".+(pdf|download|/doc|document|(?:/|[?]|&)file|/fulltext|attachment|/paper|viewfile|viewdoc|/get|cgi/viewcontent.cgi\\?|t[ée]l[ée]charger|descargar).*";
// "DOC_URL_FILTER" works for lowerCase Strings (we use the "ignore-case" indicator in the "regexp_like()" method).
public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);
// TODO - Unify this ExecutorService with the hash-matching executorService. Since one will ALWAYS be called after the other. So why having two ExecServices to handle?
@ -135,7 +140,7 @@ public class UrlsServiceImpl implements UrlsService {
" and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.original_url=pu.url and a.error_class = 'noRetry' limit 1)\n" +
" and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" + // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
") as distinct_results\n" +
"order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, reverse(pubid), url\n" + // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
"order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, (case when regexp_like(url, '" + DOC_URL_FILTER + "', 'i') then 1 else 0 end) desc, reverse(pubid), url\n" + // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
"limit " + assignmentsLimit;
// The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.