---
# Spring Boot application configuration for the OpenAIRE PDF-Aggregation "Urls_Controller" service.
# Embedded-server settings.
server:
  port: 1880
  servlet:
    context-path: /api
  # Let in-flight requests finish before shutdown; the phase timeout is set
  # via "spring.lifecycle.timeout-per-shutdown-phase" further below.
  shutdown: graceful
# Controller-specific settings for the PDF-aggregation service.
services:
  pdfaggregation:
    controller:
      isTestEnvironment: true
      # In case the "isTestEnvironment" is "true", the "testDatabase" below and all its tables are created (if not exist).
      # The tables "datasource", "publication", "publication_pids" and "publication_urls" are filled with the data from the same tables existing in the "initialDatabase", if they don't exist.
      # In case the "isTestEnvironment" is "false", the "initialDatabase" is used. The Controller assumes that the above 4 tables are present, and only creates, if they don't exist, the following tables:
      # "assignment", "attempt" and "payload", which are populated during execution.

      db:
        initialDatabaseName: pdfaggregation_i
        testDatabaseName: pdfaggregationdatabase_payloads_view_test

      assignmentLimit: 10000
      maxAttemptsPerRecord: 3
      baseFilesLocation: /tmp/
      workerReportsDirPath: /reports/workerReports/
      parquetLocalDirectoryPath: ${services.pdfaggregation.controller.baseFilesLocation}parquetFiles/

      # S3 object-store credentials and behavior (placeholder "XA" values must be replaced per deployment).
      s3:
        endpoint: XA
        accessKey: XA
        secretKey: XA
        region: XA
        bucketName: XA
        shouldEmptyBucket: false
        shouldShowAllS3Buckets: true
# Bulk-import settings.
bulk-import:
  baseBulkImportLocation: /mnt/bulk_import/
  bulkImportReportLocation: /reports/bulkImportReports/
  bulkImportSources:  # These sources are accepted for bulk-import requests and are excluded from crawling.
    arxivImport:
      # Quoted: the ID contains "::" and must always stay a plain string.
      datasourceID: 'opendoar____::6f4922f45568161a8cdf4ad2299f6d23'
      datasourcePrefix: arXiv_______  # For PID-providing datasource, we use the PID-prefix here. (so not the datasource-prefix: "od________18")
      pdfUrlPrefix: 'https://arxiv.org/pdf/'
      mimeType: application/pdf
    # otherImport:
    #   datasourceID: 'othersource__::0123'
    #   datasourcePrefix: other_______
    #   pdfUrlPrefix: 'https://example.org/pdf/'
    #   mimeType: application/pdf
# Spring framework settings.
spring:
  application:
    name: Urls_Controller
  datasource:
    driver-class-name: com.cloudera.impala.jdbc41.Driver
    url: XA
    username: ''
    password: ''
    # HikariCP connection-pool tuning (all times in milliseconds).
    hikari:
      connectionTimeout: 30000
      idleTimeout: 600000
      maxLifetime: 1800000
      maximumPoolSize: 20
      minimumIdle: 4
      pool-name: ControllerPool
  output:
    ansi:
      enabled: always
  lifecycle:
    # Grace period for the "server.shutdown: graceful" phase.
    timeout-per-shutdown-phase: 2m
# WebHDFS access settings (placeholder "XA" values must be replaced per deployment).
hdfs:
  baseUrl: XA
  userName: XA
  password: XA
  httpAuth: ''
  # HTTP-Authorization --> Authorization: Basic Base64Encode(username:password)
  # Give the credentials by either giving the Http-Auth-string AND the username (used as parameter in the WebHdfs-requests)
  # Or by giving the username AND the password, in order for the program to create the auth-String programmatically.
  # The first approach is intended for more privacy, while the second for more ease. Either way, all three should be uncommented, no matter which ones are used.

  parquetRemoteBaseDirectoryPath: /tmp/parquet_uploads/  # In case the "isTestEnvironment" is "true", this is automatically supplemented by a "/test/" subdirectory, to avoid any conflicts.
# Prometheus related config (Spring Boot Actuator endpoints and metric tags).
management:
  endpoint:
    health:
      enabled: true
      show-details: always
    metrics:
      enabled: true
    prometheus:
      enabled: true
  endpoints:
    web:
      base-path: /actuator
      exposure:
        include: health,info,prometheus,metrics
  metrics:
    tags:
      # Tag every exported metric with the application name.
      application: ${spring.application.name}
# Per-package log levels (keys mirror Java package hierarchy).
logging:
  level:
    root: INFO
    eu:
      openaire:
        urls_controller: DEBUG
    org:
      springframework:
        security: WARN
        web: INFO
      apache:
        hadoop:
          io:
            compress: WARN