From 3a70b57146781f26afb51912dc0325588f1541d3 Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Mon, 15 Jan 2024 12:59:50 +0200
Subject: [PATCH] Prioritize the full-text urls over the landing-page ones.

---
 .../openaire/urls_controller/services/UrlsServiceImpl.java | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
index 6b5eaf3..48106cc 100644
--- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
@@ -67,6 +67,11 @@ public class UrlsServiceImpl implements UrlsService {
 
 	private static String excludedDatasourceIDsStringList = null;
 
+
+	private static final String DOC_URL_FILTER = ".+(pdf|download|/doc|document|(?:/|[?]|&)file|/fulltext|attachment|/paper|viewfile|viewdoc|/get|cgi/viewcontent.cgi\\?|t[ée]l[ée]charger|descargar).*";
+	// "DOC_URL_FILTER" works for lowerCase Strings (we use the "ignore-case" indicator in the "regexp_like()" method).
+
+
 	public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);	// TODO - Unify this ExecutorService with the hash-matching executorService. Since one will ALWAYS be called after the other. So why having two ExecServices to handle?
 
 
@@ -135,7 +140,7 @@ public class UrlsServiceImpl implements UrlsService {
 				" and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.original_url=pu.url and a.error_class = 'noRetry' limit 1)\n" +
 				" and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" +	// Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
 				") as distinct_results\n" +
-				"order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, reverse(pubid), url\n" +	// We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
+				"order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, (case when regexp_like(url, '" + DOC_URL_FILTER + "', 'i') then 1 else 0 end) desc, reverse(pubid), url\n" +	// We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
 				"limit " + assignmentsLimit;	// The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.
 
 
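
Note (not part of the patch): a minimal sketch of how the new ordering criterion behaves. It compiles the same DOC_URL_FILTER pattern in plain Java with Pattern.CASE_INSENSITIVE, mirroring the 'i' option the query passes to regexp_like(), and prints the 1/0 sort key that the added "case when ... then 1 else 0 end" expression would yield. The class name and sample URLs are made up for illustration.

	import java.util.regex.Pattern;

	// Hypothetical demo class; only the regex is taken from the patch.
	public class DocUrlFilterDemo {

		// Same pattern as DOC_URL_FILTER above; CASE_INSENSITIVE stands in for
		// the "ignore-case" ('i') option of regexp_like() in the SQL query.
		private static final Pattern DOC_URL_FILTER = Pattern.compile(
				".+(pdf|download|/doc|document|(?:/|[?]|&)file|/fulltext|attachment|/paper|viewfile|viewdoc|/get|cgi/viewcontent.cgi\\?|t[ée]l[ée]charger|descargar).*",
				Pattern.CASE_INSENSITIVE);

		public static void main(String[] args) {
			String[] urls = {	// Made-up example URLs.
					"https://repo.example.org/article/123/FULLTEXT.PDF",	// matches -> sort key 1 (likely full-text)
					"https://repo.example.org/record/123"	// no match -> sort key 0 (likely landing page)
			};
			for (String url : urls) {
				int sortKey = DOC_URL_FILTER.matcher(url).matches() ? 1 : 0;	// value of the "case when ... end" expression
				System.out.println(sortKey + "  " + url);
			}
		}
	}

Because the new expression is sorted "desc", URLs with sort key 1 are assigned before landing-page URLs whenever all the preceding order-by criteria (attempt_count, level, pub_year) tie.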