UrlsController/src/main/resources/application.yml

server:
  port: 1880
  servlet:
    context-path: /api
  shutdown: graceful

services:
  pdfaggregation:
    controller:
      isTestEnvironment: true
      # If "isTestEnvironment" is "true", the "testDatabase" below and all its tables are created (if they do not already exist).
      # The tables "datasource", "publication", "publication_pids" and "publication_urls", if they do not already exist, are filled with the data from the same tables in the "initialDatabase".
      # If "isTestEnvironment" is "false", the "initialDatabase" is used. The Controller assumes that the above 4 tables are present, and creates only the following tables, if they do not exist:
      # "assignment", "attempt" and "payload", which are populated during execution.
      db:
        initialDatabaseName: pdfaggregation_i
        testDatabaseName: pdfaggregationdatabase_test_new
      assignmentLimit: 10000
      maxAttemptsPerRecord: 3
      numOfBackgroundThreads: 8
      # This is the number of threads running in the background, processing the workerReports and the bulkImport procedures.
      baseFilesLocation: /tmp/
      workerReportsDirPath: /reports/workerReports/
      parquetLocalDirectoryPath: ${services.pdfaggregation.controller.baseFilesLocation}parquetFiles/
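      # With the default "baseFilesLocation" above, this placeholder resolves to "/tmp/parquetFiles/".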
      s3:
        endpoint: XA
        accessKey: XA
        secretKey: XA
        region: XA
        bucketName: XA
        shouldEmptyBucket: false # For data-security, this ability is disabled; to use it, the relevant code has to be uncommented in "util.S3ObjectStore.java".
        shouldShowAllS3Buckets: true
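        # The "XA" values above are placeholders; fill in the endpoint and the credentials of your deployment's S3-compatible object-store (e.g., a MinIO instance).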
    worker:
      port: 1881

bulk-import:
  baseBulkImportLocation: /mnt/bulk_import/
  bulkImportReportLocation: /reports/bulkImportReports/
  numOfThreadsForBulkImportProcedures: 6
  bulkImportSources: # These sources are accepted for bulk-import requests and are excluded from crawling.
    arxivImport:
      datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
      datasourcePrefix: arXiv_______ # For a PID-providing datasource, we use the PID-prefix here (not the datasource-prefix "od________18").
      pdfUrlPrefix: https://arxiv.org/pdf/
      mimeType: application/pdf
      isAuthoritative: true
    # otherImport:
    #   datasourceID: othersource__::0123
    #   datasourcePrefix: other_______
    #   pdfUrlPrefix: https://example.org/pdf/
    #   mimeType: application/pdf
    #   isAuthoritative: false
  # For "authoritative" sources, a special prefix is selected, from: https://graph.openaire.eu/docs/data-model/pids-and-identifiers/#identifiers-in-the-graph
  # For the rest, the "datasource_prefix" is selected, using this query:
  #   select datasource.namespaceprefix.value
  #   from openaire_prod_<PROD_DATE>.datasource  -- Use the production-table with the latest date.
  #   where officialname.value = 'datasourceOfficialName';

spring:
  application:
    name: Urls_Controller
  datasource:
    driver-class-name: com.cloudera.impala.jdbc41.Driver
    url: XA
    username: ''
    password: ''
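    # Hypothetical example of a connection-URL for the Impala JDBC41 driver (host, port and auth-mechanism depend on the deployment):
    # url: jdbc:impala://impala.example.org:21050/;AuthMech=0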
    hikari:
      connectionTimeout: 30000
      idleTimeout: 600000
      maxLifetime: 1800000
      maximumPoolSize: 20
      minimumIdle: 4
      pool-name: ControllerPool
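      # The Hikari timing values above are in milliseconds: connectionTimeout = 30 sec, idleTimeout = 10 min, maxLifetime = 30 min.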
  output:
    ansi:
      enabled: always
  lifecycle:
    timeout-per-shutdown-phase: 5m

hdfs:
  baseUrl: XA
  userName: XA
  password: XA
  httpAuth: ''
  # HTTP-Authorization --> Authorization: Basic Base64Encode(username:password)
  # Give the credentials either by giving the Http-Auth string AND the username (used as a parameter in the WebHDFS requests),
  # or by giving the username AND the password, so that the program creates the auth-string programmatically.
  # The first approach is intended for more privacy, the second for more ease. Either way, all three properties should be uncommented, no matter which ones are used.
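  # Example with hypothetical credentials: for userName "hdfs" and password "pass", Base64Encode("hdfs:pass") gives "aGRmczpwYXNz",
  # so the header would be --> Authorization: Basic aGRmczpwYXNz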
  parquetRemoteBaseDirectoryPath: /tmp/parquet_uploads/ # In case the "isTestEnvironment" is "true", this is automatically supplemented by a "/test/" subdirectory, to avoid any conflicts.

# Prometheus-related config.
management:
  endpoint:
    health:
      enabled: true
      show-details: always
    metrics:
      enabled: true
    prometheus:
      enabled: true
  endpoints:
    web:
      base-path: /actuator
      exposure:
        include: health,info,prometheus,metrics
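  # With the server-settings above and no separate management-port set, Prometheus can scrape the metrics from: http://<controller-host>:1880/api/actuator/prometheus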
  metrics:
    tags:
      application: ${spring.application.name}

logging:
  file:
    path: logs
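    # Since only the "path" is set (and not a file-"name"), Spring Boot writes the log to "logs/spring.log" by default.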
  level:
    root: INFO
    eu:
      openaire:
        urls_controller: DEBUG
    org:
      springframework:
        security: WARN
        web: INFO
      apache:
        hadoop:
          io:
            compress: WARN