server:
  port: 1880
  servlet:
    context-path: /api
  shutdown: graceful

services:
  pdfaggregation:
    controller:
      isTestEnvironment: true
      # In case the "isTestEnvironment" is "true", the "testDatabase" below and all its tables are created (if they don't exist).
      # The tables "datasource", "publication", "publication_pids" and "publication_urls" are filled with the data from the same tables of the "initialDatabase", if they don't already exist.
      # In case the "isTestEnvironment" is "false", the "initialDatabase" is used. The Controller assumes that the above 4 tables are present, and only creates the following tables, if they don't exist:
      # "assignment", "attempt" and "payload", which are populated during execution.
      db:
        initialDatabaseName: pdfaggregation_i
        testDatabaseName: pdfaggregationdatabase_test_new

      assignmentLimit: 10000
      maxAttemptsPerRecord: 3
      numOfBackgroundThreads: 8  # This refers to the number of threads running in the background and processing workerReports and bulkImport procedures.
      baseFilesLocation: /tmp/
      workerReportsDirPath: /reports/workerReports/
      parquetLocalDirectoryPath: ${services.pdfaggregation.controller.baseFilesLocation}parquetFiles/

      s3:
        endpoint: XA
        accessKey: XA
        secretKey: XA
        region: XA
        bucketName: XA
        shouldEmptyBucket: false  # For data-security, in order to use this ability, the relevant code has to be uncommented in "util.S3ObjectStore.java".
        shouldShowAllS3Buckets: true

    worker:
      port: 1881

bulk-import:
  baseBulkImportLocation: /mnt/bulk_import/
  bulkImportReportLocation: /reports/bulkImportReports/
  numOfThreadsForBulkImportProcedures: 6
  bulkImportSources:  # These sources are accepted for bulk-import requests and are excluded from crawling.
    arxivImport:
      datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
      datasourcePrefix: arXiv_______  # For a PID-providing datasource, we use the PID-prefix here (not the datasource-prefix: "od________18").
      pdfUrlPrefix: https://arxiv.org/pdf/
      mimeType: application/pdf
      isAuthoritative: true
#    otherImport:
#      datasourceID: othersource__::0123
#      datasourcePrefix: other_______
#      pdfUrlPrefix: https://example.org/pdf/
#      mimeType: application/pdf
#      isAuthoritative: false

# For "authoritative" sources, a special prefix is selected, from: https://graph.openaire.eu/docs/data-model/pids-and-identifiers/#identifiers-in-the-graph
# For the rest, the "datasource_prefix" is selected, using this query:
#   select datasource.namespaceprefix.value
#   from openaire_prod_.datasource   -- Here use the production-table with the latest date.
#   where officialname.value = 'datasourceOfficialName';

spring:
  application:
    name: Urls_Controller
  datasource:
    driver-class-name: com.cloudera.impala.jdbc41.Driver
    url: XA
    username: ''
    password: ''
    hikari:
      connectionTimeout: 30000
      idleTimeout: 600000
      maxLifetime: 1800000
      maximumPoolSize: 20
      minimumIdle: 4
      pool-name: ControllerPool
  output:
    ansi:
      enabled: always
  lifecycle:
    timeout-per-shutdown-phase: 5m

hdfs:
  baseUrl: XA
  userName: XA
  password: XA
  httpAuth: ''
  # HTTP-Authorization --> Authorization: Basic Base64Encode(username:password)
  # Give the credentials either by providing the Http-Auth-string AND the username (used as a parameter in the WebHdfs-requests),
  # or by providing the username AND the password, in order for the program to create the auth-String programmatically.
  # The first approach is intended for more privacy, while the second for more ease. Either way, all three properties should be uncommented, no matter which ones are used.
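  # A hedged illustration of how the auth-String above is formed, using hypothetical credentials "myUser" / "myPass" (not part of this service):
  #   echo -n 'myUser:myPass' | base64     # -> bXlVc2VyOm15UGFzcw==
  # which corresponds to the request-header:  Authorization: Basic bXlVc2VyOm15UGFzcw==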
  parquetRemoteBaseDirectoryPath: /tmp/parquet_uploads/  # In case the "isTestEnvironment" is "true", this is automatically supplemented by a "/test/" subdirectory, to avoid any conflicts.

# Prometheus related config.
management:
  endpoint:
    health:
      enabled: true
      show-details: always
    metrics:
      enabled: true
    prometheus:
      enabled: true
  endpoints:
    web:
      base-path: /actuator
      exposure:
        include: health,info,prometheus,metrics
  metrics:
    tags:
      application: ${spring.application.name}

logging:
  file:
    path: logs
  level:
    root: INFO
    eu:
      openaire:
        urls_controller: DEBUG
    org:
      springframework:
        security: WARN
        web: INFO
      apache:
        hadoop:
          io:
            compress: WARN
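
# A hedged usage note: assuming the settings above (and no separate "management.server.port"), the actuator endpoints are served
# under the servlet context-path, e.g. "http://localhost:1880/api/actuator/health" and "http://localhost:1880/api/actuator/prometheus".
# A minimal Prometheus scrape-job sketch for that path (the job-name and target host are assumptions):
#   scrape_configs:
#     - job_name: 'urls_controller'
#       metrics_path: '/api/actuator/prometheus'
#       static_configs:
#         - targets: [ 'localhost:1880' ]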