UrlsController/src/main/resources/application.yml

server:
  port: 1880
  servlet:
    context-path: /api
  shutdown: graceful

services:
  pdfaggregation:
    controller:
      isTestEnvironment: true
      # If "isTestEnvironment" is "true", the "testDatabase" below and all its tables are created (if they do not already exist).
      # The tables "datasource", "publication", "publication_pids" and "publication_urls", if they do not already exist, are filled with the data from the same tables in the "initialDatabase".
      # If "isTestEnvironment" is "false", the "initialDatabase" is used. The Controller assumes that the above 4 tables are present, and creates only the following tables, if they do not exist:
      # "assignment", "attempt" and "payload", which are populated during execution.
      db:
        initialDatabaseName: pdfaggregation_i
        testDatabaseName: pdfaggregationdatabase_test_new
      assignmentLimit: 10000
      maxAttemptsPerRecord: 3
      numOfBackgroundThreads: 8
      # This is the number of threads running in the background, processing the workerReports and the bulkImport procedures.
      baseFilesLocation: /tmp/
      workerReportsDirPath: /reports/workerReports/
      parquetLocalDirectoryPath: ${services.pdfaggregation.controller.baseFilesLocation}parquetFiles/
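      # With the default "baseFilesLocation" above, this placeholder resolves to "/tmp/parquetFiles/".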
      s3:
        endpoint: XA
        accessKey: XA
        secretKey: XA
        region: XA
        bucketName: XA
        shouldEmptyBucket: false # For data-security, this ability is disabled; to use it, the relevant code has to be uncommented in "util.S3ObjectStore.java".
        shouldShowAllS3Buckets: true
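        # The "XA" values above are placeholders; fill in the endpoint and the credentials of your deployment's S3-compatible object-store (e.g., a MinIO instance).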
    worker:
      port: 1881

bulk-import:
  baseBulkImportLocation: /mnt/bulk_import/
  bulkImportReportLocation: /reports/bulkImportReports/
  numOfThreadsForBulkImportProcedures: 6
  bulkImportSources: # These sources are accepted for bulk-import requests and are excluded from crawling.
    arxivImport:
      datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
      datasourcePrefix: arXiv_______ # For a PID-providing datasource, we use the PID-prefix here (not the datasource-prefix "od________18").
      pdfUrlPrefix: https://arxiv.org/pdf/
      mimeType: application/pdf
      isAuthoritative: true
    # otherImport:
    #   datasourceID: othersource__::0123
    #   datasourcePrefix: other_______
    #   pdfUrlPrefix: https://example.org/pdf/
    #   mimeType: application/pdf
    #   isAuthoritative: false
  # For "authoritative" sources, a special prefix is selected, from: https://graph.openaire.eu/docs/data-model/pids-and-identifiers/#identifiers-in-the-graph
  # For the rest, the "datasource_prefix" is selected, using this query:
  #   select datasource.namespaceprefix.value
  #   from openaire_prod_<PROD_DATE>.datasource  -- Use the production-table with the latest date.
  #   where officialname.value = 'datasourceOfficialName';

spring:
  application:
    name: Urls_Controller
  datasource:
    driver-class-name: com.cloudera.impala.jdbc41.Driver
    url: XA
    username: ''
    password: ''
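    # Hypothetical example of a connection-URL for the Impala JDBC41 driver (host, port and auth-mechanism depend on the deployment):
    # url: jdbc:impala://impala.example.org:21050/;AuthMech=0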
    hikari:
      connectionTimeout: 30000
      idleTimeout: 600000
      maxLifetime: 1800000
      maximumPoolSize: 20
      minimumIdle: 4
      pool-name: ControllerPool
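      # The Hikari timing values above are in milliseconds: connectionTimeout = 30 sec, idleTimeout = 10 min, maxLifetime = 30 min.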
  output:
    ansi:
      enabled: always
  lifecycle:
    timeout-per-shutdown-phase: 5m

hdfs:
  baseUrl: XA
  userName: XA
  password: XA
  httpAuth: ''
  # HTTP-Authorization --> Authorization: Basic Base64Encode(username:password)
  # Give the credentials either by giving the Http-Auth string AND the username (used as a parameter in the WebHDFS requests),
  # or by giving the username AND the password, so that the program creates the auth-string programmatically.
  # The first approach is intended for more privacy, the second for more ease. Either way, all three properties should be uncommented, no matter which ones are used.
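  # Example with hypothetical credentials: for userName "hdfs" and password "pass", Base64Encode("hdfs:pass") gives "aGRmczpwYXNz",
  # so the header would be --> Authorization: Basic aGRmczpwYXNz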
  parquetRemoteBaseDirectoryPath: /tmp/parquet_uploads/ # In case the "isTestEnvironment" is "true", this is automatically supplemented by a "/test/" subdirectory, to avoid any conflicts.

# Prometheus-related config.
management:
  endpoint:
    health:
      enabled: true
      show-details: always
    metrics:
      enabled: true
    prometheus:
      enabled: true
  endpoints:
    web:
      base-path: /actuator
      exposure:
        include: health,info,prometheus,metrics
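  # With the server-settings above and no separate management-port set, Prometheus can scrape the metrics from: http://<controller-host>:1880/api/actuator/prometheus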
  metrics:
    tags:
      application: ${spring.application.name}

logging:
  file:
    path: logs
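    # Since only the "path" is set (and not a file-"name"), Spring Boot writes the log to "logs/spring.log" by default.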
  level:
    root: INFO
    eu:
      openaire:
        urls_controller: DEBUG
    org:
      springframework:
        security: WARN
        web: INFO
      apache:
        hadoop:
          io:
            compress: WARN