---
# Spring Boot configuration for the OpenAIRE PDF-aggregation "Urls_Controller" service.
# (Reconstructed: the previous copy had lost all indentation and contained pasted viewer junk.)

server:
  port: 1880
  servlet:
    context-path: /api

services:
  pdfaggregation:
    controller:
      isTestEnvironment: false
      # In case the "isTestEnvironment" is "true", the "testDatabase" below and all its tables are created (if not exist).
      # The tables "datasource", "publication", "publication_pids" and "publication_urls" are filled with the data from the same tables existing in the "initialDatabase", if they don't exist.
      # In case the "isTestEnvironment" is "false", the "initialDatabase" is used. The Controller assumes that the above 4 tables are present, and only creates, if they don't exist, the following tables:
      # "assignment", "attempt" and "payload", which are populated during execution.

      db:
        initialDatabaseName: pdfaggregation_i
        testDatabaseName: pdfaggregationdatabase_new_s3_names

      assignmentLimit: 10000
      maxAttemptsPerRecord: 3
      baseFilesLocation: tmp/
      parquetLocalDirectoryPath: ${services.pdfaggregation.controller.baseFilesLocation}parquetFiles/

      s3:
        endpoint: XA
        accessKey: XA
        secretKey: XA
        region: XA
        bucketName: XA
        shouldEmptyBucket: false
        shouldShowAllS3Buckets: true

      datasources: # Provide a list of datasource IDs, which should be excluded from crawling. Their content is either bulk-imported or is known to be restricted.
        excludedIDs: > # Use comma-separated values (one in each line for best readability), as Spring is currently incapable of parsing Dropwizard-styled lists (at least without additional config).
          opendoar____::6f4922f45568161a8cdf4ad2299f6d23

      # Since we use a multi-line value for our list, we add the ID-explanations here (otherwise comments will be part of the values):
      # First-id: arXiv.org e-Print Archive

spring:
  application:
    name: Urls_Controller
  datasource:
    driver-class-name: com.cloudera.impala.jdbc41.Driver
    url: XA
    username: ''
    password: ''
    hikari:
      connectionTimeout: 30000
      idleTimeout: 600000
      maxLifetime: 1800000
      maximumPoolSize: 20
      minimumIdle: 4
      pool-name: ControllerPool
  output:
    ansi:
      enabled: always

hdfs:
  baseUrl: XA
  userName: XA
  password: XA
  httpAuth: ''
  # HTTP-Authorization --> Authorization: Basic Base64Encode(username:password)
  # Give the credentials by either giving the Http-Auth-string AND the username (used as parameter in the WebHdfs-requests),
  # or by giving the username AND the password, in order for the program to create the auth-String programmatically.
  # The first approach is intended for more privacy, while the second for more ease. Either way, all three should be uncommented, no matter which ones are used.

  # NOTE(review): nesting of the following key under "hdfs" is reconstructed from its position — confirm against the consumer's property paths.
  parquetRemoteBaseDirectoryPath: /tmp/parquet_uploads/ # In case the "isTestEnvironment" is "true", this is automatically supplemented by a "/test/" subdirectory, to avoid any conflicts.

# Prometheus related config.
management:
  endpoint:
    health:
      enabled: true
      show-details: always
    metrics:
      enabled: true
    prometheus:
      enabled: true
  endpoints:
    web:
      base-path: /actuator
      exposure:
        include: health,info,prometheus,metrics
  metrics:
    tags:
      application: ${spring.application.name}

logging:
  level:
    root: INFO
    eu:
      openaire:
        urls_controller: DEBUG
    org:
      springframework:
        security: WARN
        web: INFO
      apache:
        hadoop:
          io:
            compress: WARN