From a1c16ffc19ebdf2cca2ab69da12dbeb5fd21c343 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 16 Feb 2023 14:24:47 +0200 Subject: [PATCH] - Exclude empty and null urls in the assignments. - Update the "getFullTextsImproved"-call to "getFullTexts", now that the "improved" version is stable. - Update Gradle. - Code polishing. --- README.md | 2 +- gradle/wrapper/gradle-wrapper.properties | 2 +- installAndRun.sh | 2 +- .../controllers/UrlController.java | 20 ++++++++++--------- .../services/StatsService.java | 4 ++++ .../urls_controller/util/FileUtils.java | 2 +- 6 files changed, 19 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 75ee072..f23dc9a 100644 --- a/README.md +++ b/README.md @@ -25,4 +25,4 @@ If you want to build and run the app on a **Docker Container**, then run the scr Implementation notes: - For transferring the full-text files, we use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings very big benefits in compression rate and speed. 
-- The names of the uploaded full-text files ae of the following form: "***datasourceID/recordId::fileHash.pdf***" +- The uploaded full-text files follow this naming-scheme: "**datasourceID/recordId::fileHash.pdf**" diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index f398c33..42defcc 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.0-bin.zip networkTimeout=10000 zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/installAndRun.sh b/installAndRun.sh index 69a45aa..0c00435 100755 --- a/installAndRun.sh +++ b/installAndRun.sh @@ -26,7 +26,7 @@ if [[ justInstall -eq 1 && shouldRunInDocker -eq 1 ]]; then justInstall=0 fi -gradleVersion="7.6" +gradleVersion="8.0" if [[ justInstall -eq 0 ]]; then diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index 5461cab..e415d50 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -91,24 +91,26 @@ public class UrlController { "from (select distinct pubid, url, datasourceid, datasourcetype, attempt_count\n" + "from (\n" + "select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" + - "from " + ImpalaConnector.databaseName + ".publication p\n" + + "from " + ImpalaConnector.databaseName + ".publication p\n" + "join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" + "join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + - "left outer join (select 
count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts on attempts.id=p.id\n" + + "left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" + + "on attempts.id=p.id\n" + "left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" + "union all\n" + - "select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl)\n" + - "as existing on existing.id=p.id and existing.original_url=pu.url\n" + + "select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" + + "on existing.id=p.id and existing.original_url=pu.url\n" + "where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" + - "limit " + (assignmentsLimit * 10) + ")\n" + - "as non_distinct_results\n" + + "and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls; there are no "null" urls, but we keep the relevant check for future-proofing. + "limit " + (assignmentsLimit * 10) + + ")\nas non_distinct_results\n" + "order by coalesce(attempt_count, 0), reverse(pubid), url\n" + - "limit " + assignmentsLimit + ")\n" + - "as findAssignmentsQuery"; + "limit " + assignmentsLimit + + "\n) as findAssignmentsQuery"; // The "order by" in the end makes sure the older attempted records will be re-attempted after a long time. - //logger.debug(findAssignmentsQuery); // DEBUG! + //logger.debug("findAssignmentsQuery:\n" + findAssignmentsQuery); // DEBUG! 
String getAssignmentsQuery = "select * from " + ImpalaConnector.databaseName + ".current_assignment"; diff --git a/src/main/java/eu/openaire/urls_controller/services/StatsService.java b/src/main/java/eu/openaire/urls_controller/services/StatsService.java index 6cb0f64..16e1bf9 100644 --- a/src/main/java/eu/openaire/urls_controller/services/StatsService.java +++ b/src/main/java/eu/openaire/urls_controller/services/StatsService.java @@ -67,4 +67,8 @@ public class StatsService { } } + // To get the human-friendly timestamp format from the BigInt in the database: + // select from_timestamp(CAST(CAST(`date` as decimal(30,0))/1000 AS timestamp), "yyyy-MM-dd HH:mm:ss.SSS") from payload + + } diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 5b58bfe..8174ca3 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -237,7 +237,7 @@ public class FileUtils { logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numAllFullTexts + " distinct non-already-uploaded fullTexts. Going to request them from the Worker \"" + workerId + "\", in " + numOfBatches + " batches (" + numOfFullTextsPerBatch + " files each)."); // Check if one full text is left out because of the division. Put it int the last batch. - String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTextsImproved/" + assignmentsBatchCounter + "/" + numOfBatches + "/"; + String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTexts/" + assignmentsBatchCounter + "/" + numOfBatches + "/"; // TODO - The worker should send the port in which it accepts requests, along with the current request. // TODO - The least we have to do it to expose the port-assignment somewhere more obvious like inside the "application.properties" file.