From e8644cb64f81e5f8291ace0492494aee26473583 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 5 Jul 2023 17:10:30 +0300 Subject: [PATCH] - Optimize the "insertAssignmentsQuery". - Add documentation about the Prometheus Metrics, in README. - Update Dependencies. - Code polishing. --- README.md | 15 +++++++++++++- build.gradle | 20 ++++++++++--------- gradle/wrapper/gradle-wrapper.properties | 2 +- installAndRun.sh | 2 +- .../components/ScheduledTasks.java | 4 ++-- .../controllers/UrlsController.java | 2 +- .../services/UrlsServiceImpl.java | 6 +++--- .../urls_controller/util/FileUtils.java | 2 +- 8 files changed, 34 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index da1c121..d4fae1b 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ For interacting with the database we use [**Impala**](https://impala.apache.org/ - "**getNumberOfPayloadsAggregatedByService**" endpoint: **http://\:\/api/stats/getNumberOfPayloadsAggregatedByService**
This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, both through crawling and bulk-import procedures. - "**getNumberOfLegacyPayloads**" endpoint: **http://\:\/api/stats/getNumberOfLegacyPayloads**
- This endpoint returns the number of payloads which were aggregated by methods other thant the PDF Aggregation Service. + This endpoint returns the number of payloads which were aggregated by methods other than the PDF Aggregation Service. - "**getNumberOfPayloadsForDatasource**" endpoint: **http://\:\/api/stats/getNumberOfPayloadsForDatasource?datasourceId=\**
This endpoint returns the number of payloads which belong to the datasource specified by the given datasourceID. - "**getNumberOfRecordsInspectedByServiceThroughCrawling**" endpoint: **http://\:\/api/stats/getNumberOfRecordsInspectedByServiceThroughCrawling**
@@ -47,6 +47,19 @@ Note: The Shutdown Service API is accessible by the Controller's host machine.

+ +**Prometheus Metrics**: +- "**numOfAllPayloads**" +- "**numOfPayloadsAggregatedByServiceThroughCrawling**" +- "**numOfPayloadsAggregatedByServiceThroughBulkImport**" +- "**numOfPayloadsAggregatedByService**" +- "**numOfLegacyPayloads**" +- "**numOfRecordsInspectedByServiceThroughCrawling**" +- "**getAssignments_time_seconds_max**": Time taken to return the assignments. +- "**addWorkerReport_time_seconds**": Time taken to add the WorkerReport. +
+
+ **To install and run the application**: - Run ```git clone``` and then ```cd UrlsController```. - Set the preferable values inside the [__application.yml__](https://code-repo.d4science.org/lsmyrnaios/UrlsController/src/branch/master/src/main/resources/application.yml) file. diff --git a/build.gradle b/build.gradle index 7fcad05..0480e52 100644 --- a/build.gradle +++ b/build.gradle @@ -1,12 +1,14 @@ plugins { - id 'org.springframework.boot' version '2.7.12' + id 'org.springframework.boot' version '2.7.13' id 'io.spring.dependency-management' version '1.1.0' id 'java' } -group = 'eu.openaire.urls_controller' -version = '2.1.0-SNAPSHOT' -sourceCompatibility = '1.8' +java { + group = 'eu.openaire.urls_controller' + version = '2.1.0-SNAPSHOT' + sourceCompatibility = '1.8' +} repositories { mavenCentral() @@ -41,7 +43,7 @@ dependencies { //implementation group: 'jakarta.validation', name: 'jakarta.validation-api', version: '3.0.2' // https://mvnrepository.com/artifact/com.google.guava/guava - implementation group: 'com.google.guava', name: 'guava', version: '32.0.1-jre' + implementation group: 'com.google.guava', name: 'guava', version: '32.1.1-jre' // https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 implementation group: 'org.apache.commons', name: 'commons-lang3', version: '3.12.0' @@ -49,7 +51,7 @@ dependencies { // https://mvnrepository.com/artifact/org.apache.commons/commons-compress implementation 'org.apache.commons:commons-compress:1.23.0' - implementation 'io.minio:minio:8.5.3' + implementation 'io.minio:minio:8.5.4' // https://mvnrepository.com/artifact/com.cloudera.impala/jdbc implementation("com.cloudera.impala:jdbc:2.5.31") { @@ -75,7 +77,7 @@ dependencies { implementation('org.apache.parquet:parquet-avro:1.13.1') // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common - implementation('org.apache.hadoop:hadoop-common:3.3.5') { + implementation('org.apache.hadoop:hadoop-common:3.3.6') { exclude group: 'org.apache.parquet', module: 'parquet-avro' exclude group: 'org.apache.avro', module: 'avro' exclude group: 'org.slf4j', module: 'slf4j-api' @@ -91,7 +93,7 @@ dependencies { } // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core - implementation('org.apache.hadoop:hadoop-mapreduce-client-core:3.3.5') { + implementation('org.apache.hadoop:hadoop-mapreduce-client-core:3.3.6') { exclude group: 'org.apache.parquet', module: 'parquet-avro' exclude group: 'org.apache.avro', module: 'avro' exclude group: 'org.slf4j', module: 'slf4j-api' @@ -108,7 +110,7 @@ dependencies { implementation 'com.fasterxml.woodstox:woodstox-core:6.5.1' // https://mvnrepository.com/artifact/org.json/json - implementation 'org.json:json:20230227' + implementation 'org.json:json:20230618' // https://mvnrepository.com/artifact/com.google.code.gson/gson implementation 'com.google.code.gson:gson:2.10.1' diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 37aef8d..a363877 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.1.1-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip networkTimeout=10000 zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/installAndRun.sh b/installAndRun.sh index 7d17ff4..5431630 100755 --- a/installAndRun.sh +++ b/installAndRun.sh @@ -26,7 +26,7 @@ if [[ justInstall -eq 1 && shouldRunInDocker -eq 1 ]]; then justInstall=0 fi -gradleVersion="8.1.1" +gradleVersion="8.2" if [[ justInstall -eq 0 ]]; then diff --git a/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java b/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java index fdbfb76..f15b90e 100644 --- a/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java +++ b/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java @@ -55,7 +55,6 @@ public class ScheduledTasks { workerReportsDirPath += "/"; this.workerReportsDirPath = workerReportsDirPath; // This dir will be created later. - this.statsController = statsController; registry.gauge("numOfAllPayloads", numOfAllPayloads); @@ -180,7 +179,8 @@ public class ScheduledTasks { // Prometheus scrapes for metrics usually every 15 seconds, but that is an extremely short time-period for DB-statistics. @Scheduled(fixedDelay = 21_600_000) // Every 6 hours run thw following queries to the database and register the metric. - //@Scheduled(initialDelay = 60_000, fixedDelay = 120_000) // For testing only. + @Scheduled(initialDelay = 60_000, fixedDelay = 1_200_000) // For general testing only. + //@Scheduled(initialDelay = 60_000, fixedDelay = 120_000) // For debug testing only. public void updatePrometheusMetrics() { ResponseEntity responseEntity = statsController.getNumberOfAllPayloads(true); diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlsController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlsController.java index 3147b25..1f081a6 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlsController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlsController.java @@ -162,7 +162,7 @@ public class UrlsController { try { Files.createDirectories(currentWorkerReportLocationDir); // No-op if dir exists. It does not throw a "alreadyExistsException" } catch (Exception e) { - String errorMsg = "Could nor create the \"currentWorkerReportLocationDir\" for worker \"" + curWorkerId + "\" : " + currentWorkerReportLocationDir; + String errorMsg = "Could not create the \"currentWorkerReportLocationDir\" for worker \"" + curWorkerId + "\" : " + currentWorkerReportLocationDir; logger.error(errorMsg, e); return ResponseEntity.internalServerError().body(errorMsg); } diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java index be0bdbb..2e86409 100644 --- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java +++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java @@ -202,13 +202,13 @@ public class UrlsServiceImpl implements UrlsService { } else return ResponseEntity.status(HttpStatus.MULTI_STATUS).body(new AssignmentsResponse((long) -1, null)); } else if ( assignmentsSize < assignmentsLimit ) - logger.warn("The retrieved results were fewer (" + assignmentsSize + ") than the \"assignmentsLimit\" (" + assignmentsLimit + "), for worker with id: " + workerId + ". Will increase the \"maxAttempts\" to " + maxAttemptsPerRecordAtomic.incrementAndGet() + " for the next requests."); + logger.warn("The retrieved results were fewer (" + assignmentsSize + ") than the \"assignmentsLimit\" (" + assignmentsLimit + "), for worker with id: " + workerId + ". Will increase the \"maxAttempts\" to " + maxAttemptsPerRecordAtomic.incrementAndGet() + ", for the next requests."); logger.debug("Finished gathering " + assignmentsSize + " assignments for worker with id \"" + workerId + "\". Going to insert them into the \"assignment\" table and then return them to the worker."); // Write the Assignment details to the assignment-table. - String insertAssignmentsQuery = "insert into " + ImpalaConnector.databaseName + ".assignment \n select pub_data.pubid, pub_data.url, '" + workerId + "', " + timestampMillis + "\n" - + "from (\n select pubid, url from " + ImpalaConnector.databaseName + ".current_assignment) as pub_data"; + String insertAssignmentsQuery = "insert into " + ImpalaConnector.databaseName + ".assignment \n select pubid, url, '" + workerId + "', " + timestampMillis + + "\nfrom " + ImpalaConnector.databaseName + ".current_assignment"; try { jdbcTemplate.execute(insertAssignmentsQuery); diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 6e4e594..8ca85c6 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -703,7 +703,7 @@ public class FileUtils { } - Lock fileWriteLock = new ReentrantLock(true); + private static final Lock fileWriteLock = new ReentrantLock(true); public String writeToFile(String fileFullPath, String stringToWrite, boolean shouldLockThreads) {