---
# Spring Boot configuration for the OpenAIRE PDF-aggregation "Urls_Controller" service.
# (Reconstructed: the previous copy had lost all indentation and contained pasted viewer junk.)

server:
  port: 1880
  servlet:
    context-path: /api

services:
  pdfaggregation:
    controller:
      isTestEnvironment: false
      # In case the "isTestEnvironment" is "true", the "testDatabase" below and all its tables are created (if not exist).
      # The tables "datasource", "publication", "publication_pids" and "publication_urls" are filled with the data from the same tables existing in the "initialDatabase", if they don't exist.
      # In case the "isTestEnvironment" is "false", the "initialDatabase" is used. The Controller assumes that the above 4 tables are present, and only creates, if they don't exist, the following tables:
      # "assignment", "attempt" and "payload", which are populated during execution.

      db:
        initialDatabaseName: pdfaggregation_i
        testDatabaseName: pdfaggregationdatabase_new_s3_names

      assignmentLimit: 10000
      maxAttemptsPerRecord: 3
      baseFilesLocation: tmp/
      parquetLocalDirectoryPath: ${services.pdfaggregation.controller.baseFilesLocation}parquetFiles/

      s3:
        endpoint: XA
        accessKey: XA
        secretKey: XA
        region: XA
        bucketName: XA
        shouldEmptyBucket: false
        shouldShowAllS3Buckets: true

      datasources: # Provide a list of datasource IDs, which should be excluded from crawling. Their content is either bulk-imported or is known to be restricted.
        excludedIDs: > # Use comma-separated values (one in each line for best readability), as Spring is currently incapable of parsing Dropwizard-styled lists (at least without additional config).
          opendoar____::6f4922f45568161a8cdf4ad2299f6d23

      # Since we use a multi-line value for our list, we add the ID-explanations here (otherwise comments will be part of the values):
      # First-id: arXiv.org e-Print Archive

spring:
  application:
    name: Urls_Controller
  datasource:
    driver-class-name: com.cloudera.impala.jdbc41.Driver
    url: XA
    username: ''
    password: ''
    hikari:
      connectionTimeout: 30000
      idleTimeout: 600000
      maxLifetime: 1800000
      maximumPoolSize: 20
      minimumIdle: 4
      pool-name: ControllerPool
  output:
    ansi:
      enabled: always

hdfs:
  baseUrl: XA
  userName: XA
  password: XA
  httpAuth: ''
  # HTTP-Authorization --> Authorization: Basic Base64Encode(username:password)
  # Give the credentials by either giving the Http-Auth-string AND the username (used as parameter in the WebHdfs-requests),
  # or by giving the username AND the password, in order for the program to create the auth-String programmatically.
  # The first approach is intended for more privacy, while the second for more ease. Either way, all three should be uncommented, no matter which ones are used.

  # NOTE(review): nesting of the following key under "hdfs" is reconstructed from its position — confirm against the consumer's property paths.
  parquetRemoteBaseDirectoryPath: /tmp/parquet_uploads/ # In case the "isTestEnvironment" is "true", this is automatically supplemented by a "/test/" subdirectory, to avoid any conflicts.

# Prometheus related config.
management:
  endpoint:
    health:
      enabled: true
      show-details: always
    metrics:
      enabled: true
    prometheus:
      enabled: true
  endpoints:
    web:
      base-path: /actuator
      exposure:
        include: health,info,prometheus,metrics
  metrics:
    tags:
      application: ${spring.application.name}

logging:
  level:
    root: INFO
    eu:
      openaire:
        urls_controller: DEBUG
    org:
      springframework:
        security: WARN
        web: INFO
      apache:
        hadoop:
          io:
            compress: WARN