# Embedded web-server settings.
server:
  port: 1880
  servlet:
    context-path: /api

# Settings of the PDF-aggregation service.
services:
  pdfaggregation:
    controller:
      # In case the "isTestEnvironment" is "true", the "testDatabase" below
      # and all its tables are created (if they do not exist).
      # The tables "datasource", "publication", "publication_pids" and
      # "publication_urls" are filled with the data from the same tables
      # existing in the "initialDatabase", if they don't exist.
      # In case the "isTestEnvironment" is "false", the "initialDatabase" is
      # used. The Controller assumes that the above 4 tables are present, and
      # only creates, if they don't exist, the following tables:
      # "assignment", "attempt" and "payload", which are populated during
      # execution.
      isTestEnvironment: false
      db:
        initialDatabaseName: pdfaggregation_i
        testDatabaseName: pdfaggregationdatabase_new_s3_names

      # NOTE(review): "assignmentLimit" and "maxAttemptsPerRecord" are placed
      # directly under "controller" (siblings of "db") - confirm against the
      # property names read by the application.
      assignmentLimit: 10000
      maxAttemptsPerRecord: 3

      # "parquetLocalDirectoryPath" is resolved relative to
      # "baseFilesLocation" through the placeholder below.
      baseFilesLocation: tmp/
      parquetLocalDirectoryPath: ${services.pdfaggregation.controller.baseFilesLocation}parquetFiles/

      s3:
        endpoint: XA
        accessKey: XA
        secretKey: XA
        region: XA
        bucketName: XA
        shouldEmptyBucket: false
        shouldShowAllS3Buckets: true

      # Provide a list of datasource IDs, which should be excluded from
      # crawling. Their content is either bulk-imported or is known to be
      # restricted.
      # NOTE(review): "datasources" is placed under "controller" - confirm
      # against the property name read by the application.
      datasources:
        # Use comma-separated values (one in each line for best readability),
        # as Spring is currently incapable of parsing Dropwizard-styled lists
        # (at least without additional config).
        # Since we use a multi-line value for our list, we add the
        # ID-explanations here (otherwise comments will be part of values):
        # First-id: arXiv.org e-Print Archive
        excludedIDs: >
          opendoar____::6f4922f45568161a8cdf4ad2299f6d23

spring:
  application:
    name: Urls_Controller
  datasource:
    driver-class-name: com.cloudera.impala.jdbc41.Driver
    url: XA
    username: ''
    password: ''
    hikari:
      connectionTimeout: 30000
      idleTimeout: 600000
      maxLifetime: 1800000
      maximumPoolSize: 20
      minimumIdle: 4
      pool-name: ControllerPool
  output:
    ansi:
      enabled: always

hdfs:
  baseUrl: XA
  userName: XA
  password: XA
  # HTTP-Authorization --> Authorization: Basic Base64Encode(username:password)
  # Give the credentials by either giving the Http-Auth-string AND the
  # username (used as parameter in the WebHdfs-requests),
  # or by giving the username AND the password, in order for the program to
  # create the auth-String programmatically.
  # The first approach is intended for more privacy, while the second for
  # more ease. Either way, all three should be uncommented, no matter which
  # ones are used.
  httpAuth: ''
  # In case the "isTestEnvironment" is "true", this is automatically
  # supplemented by a "/test/" subdirectory, to avoid any conflicts.
  # NOTE(review): placed under "hdfs" - confirm against the property name
  # read by the application.
  parquetRemoteBaseDirectoryPath: /tmp/parquet_uploads/

# Prometheus related config.
management:
  endpoint:
    health:
      enabled: true
      show-details: always
    metrics:
      enabled: true
    prometheus:
      enabled: true
  endpoints:
    web:
      base-path: /actuator
      exposure:
        include: health,info,prometheus,metrics
  metrics:
    tags:
      application: ${spring.application.name}

# Per-logger log levels; the nested keys form the dotted logger names.
logging:
  level:
    root: INFO
    eu:
      openaire:
        urls_controller: DEBUG
    org:
      springframework:
        security: WARN
        web: INFO
      apache:
        hadoop:
          io:
            compress: WARN