From 34d7a143e73f4e60f3e8c9c6af2a8d1259f38063 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 1 Feb 2024 14:37:29 +0200 Subject: [PATCH] Add/improve documentation. --- README.md | 28 ++++++++++++------- .../urls_controller/util/FileUtils.java | 2 +- src/main/resources/application.yml | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d4fae1b..b88d02c 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,18 @@ It can also process **Bulk-Import** requests, from compatible data sources, in w For interacting with the database we use [**Impala**](https://impala.apache.org/).

+ +**To install and run the application**: +- Run ```git clone``` and then ```cd UrlsController```. +- Set the preferred values inside the [__application.yml__](https://code-repo.d4science.org/lsmyrnaios/UrlsController/src/branch/master/src/main/resources/application.yml) file. Specifically, for tests, set the ***services.pdfaggregation.controller.isTestEnvironment*** property to "**true**" and make sure the "***services.pdfaggregation.controller.db.testDatabaseName***" property is set to a test-database. +- Execute the ```installAndRun.sh``` script which builds and runs the app.
+ If you want to just run the app, then run the script with the argument "1": ```./installAndRun.sh 1```.
+ If you want to build and run the app on a **Docker Container**, then run the script with the argument "0" followed by the argument "1": ```./installAndRun.sh 0 1```.
+ Additionally, if you want to test/visualize the exposed metrics on Prometheus and Grafana, you can deploy their instances on docker containers, + by enabling the "runPrometheusAndGrafanaContainers" switch, inside the "./installAndRun.sh" script.
+
+ + **BulkImport API**: - "**bulkImportFullTexts**" endpoint: **http://\<IP\>:\<PORT\>/api/bulkImportFullTexts?provenance=\<provenance\>&bulkImportDir=\<bulkImportDir\>&shouldDeleteFilesOnFinish={true|false}**
This endpoint loads the right configuration with the help of the "provenance" parameter, delegates the processing to a background thread and immediately returns a message with useful information, including the "reportFileID", which can be used at any moment to request a report about the progress of the bulk-import procedure.
@@ -15,6 +27,12 @@ For interacting with the database we use [**Impala**](https://impala.apache.org/ - "**getBulkImportReport**" endpoint: **http://\<IP\>:\<PORT\>/api/getBulkImportReport?id=\<reportFileID\>**
This endpoint returns the bulkImport report, which corresponds to the given reportFileID, in JSON format.
+ +**How to add a bulk-import datasource**: +- Open the [__application.yml__](https://code-repo.d4science.org/lsmyrnaios/UrlsController/src/branch/master/src/main/resources/application.yml) file. +- Add a new object under the "bulk-import.bulkImportSources" property. +- Read the comments written at the end of the "bulk-import" property and make sure all requirements are met. +

**Statistics API**: @@ -60,16 +78,6 @@ Note: The Shutdown Service API is accessible by the Controller's host machine.

-**To install and run the application**: -- Run ```git clone``` and then ```cd UrlsController```. -- Set the preferable values inside the [__application.yml__](https://code-repo.d4science.org/lsmyrnaios/UrlsController/src/branch/master/src/main/resources/application.yml) file. -- Execute the ```installAndRun.sh``` script which builds and runs the app.
-If you want to just run the app, then run the script with the argument "1": ```./installAndRun.sh 1```.
-If you want to build and run the app on a **Docker Container**, then run the script with the argument "0" followed by the argument "1": ```./installAndRun.sh 0 1```.
-Additionally, if you want to test/visualize the exposed metrics on Prometheus and Grafana, you can deploy their instances on docker containers, -by enabling the "runPrometheusAndGrafanaContainers" switch, inside the "./installAndRun.sh" script.
-
- Implementation notes: - For transferring the full-text files, we use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings very big benefits in compression rate and speed. - The uploaded full-text files follow this naming-scheme: "**datasourceID/recordID::fileHash.pdf**" diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index da1b3fe..201a3f5 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -509,7 +509,7 @@ public class FileUtils { { // Iterate over the files and upload them to S3. //int numUploadedFiles = 0; - for( String fileName : fileNames ) + for ( String fileName : fileNames ) { if ( fileName.contains(".tar") ) // Exclude the tar-files from uploading (".tar" and ".tar.zstd"). continue; diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 694fb5f..1e31419 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -58,7 +58,7 @@ bulk-import: # For "authoritative" sources, a special prefix is selected, from: https://graph.openaire.eu/docs/data-model/pids-and-identifiers/#identifiers-in-the-graph # For the rest, the "datasource_prefix" is selected, using this query: # select datasource.namespaceprefix.value -# from openaire_prod_20230414.datasource -- Here use the latest production-table. +# from openaire_prod_<PROD_DATE>.datasource -- Here use the production-table with the latest date. # where officialname.value = 'datasourceOfficialName';