Prioritize the full-text urls over the landing-page ones.

2024-01-15 12:59:50 +02:00 · 2024-01-15 12:59:50 +02:00 · 3a70b57146
parent ee1ca8966b
commit 3a70b57146
1 changed files with 6 additions and 1 deletions
--- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
@ -67,6 +67,11 @@ public class UrlsServiceImpl implements UrlsService {

    private static String excludedDatasourceIDsStringList = null;

+
+    private static final String DOC_URL_FILTER = ".+(pdf|download|/doc|document|(?:/|[?]|&)file|/fulltext|attachment|/paper|viewfile|viewdoc|/get|cgi/viewcontent.cgi\\?|t[ée]l[ée]charger|descargar).*";
+    // "DOC_URL_FILTER" is written in lower-case only; it still matches URLs of any letter-case, because the "regexp_like()" call passes the 'i' (ignore-case) option.
+
+
    public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);
    // TODO - Unify this ExecutorService with the hash-matching executorService. Since one will ALWAYS be called after the other. So why having two ExecServices to handle?

@ -135,7 +140,7 @@ public class UrlsServiceImpl implements UrlsService {
            "       and not exists (select 1 from " + DatabaseConnector.databaseName + ".attempt a where a.original_url=pu.url and a.error_class = 'noRetry' limit 1)\n" +
            "       and (p.year <= " + currentYear + " or p.year > " + (currentYear + 5) + ")\n" +  // Exclude the pubs which will be published in the next 5 years. They don't provide full-texts now. (We don't exclude all future pubs, since, some have invalid year, like "9999").
            ") as distinct_results\n" +
-            "order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, reverse(pubid), url\n" +  // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
+            "order by coalesce(attempt_count, 0), coalesce(level, 0) desc, coalesce(pub_year, 0) desc, (case when regexp_like(url, '" + DOC_URL_FILTER + "', 'i') then 1 else 0 end) desc, reverse(pubid), url\n" +  // We also order by reverse "pubid" and "url", in order to get the exactly same records for consecutive runs, all things being equal.
            "limit " + assignmentsLimit;

        // The datasourceID and datasourceName are currently not used during the processing in the Worker. They may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one.