forked from lsmyrnaios/UrlsWorker
Compare commits
115 Commits
Author | SHA1 | Date |
---|---|---|
Lampros Smyrnaios | b40c72f78f | |
Lampros Smyrnaios | 34407179fc | |
Lampros Smyrnaios | 7857ce1f05 | |
Lampros Smyrnaios | 795d6e7c93 | |
Lampros Smyrnaios | 736d0f8526 | |
Lampros Smyrnaios | cb736a8d66 | |
Lampros Smyrnaios | 5d7465df3c | |
Lampros Smyrnaios | 24c4a75acf | |
Lampros Smyrnaios | 50d756d582 | |
Lampros Smyrnaios | 3909104a1b | |
Lampros Smyrnaios | c4770ee716 | |
Lampros Smyrnaios | 066d6f665f | |
Lampros Smyrnaios | bad9544c58 | |
Lampros Smyrnaios | 5a9e7228ae | |
Lampros Smyrnaios | 9073f56227 | |
Lampros Smyrnaios | 69ea5b6d19 | |
Lampros Smyrnaios | bfa76e9484 | |
Lampros Smyrnaios | 10e39d79a4 | |
Lampros Smyrnaios | 1b45f384a7 | |
Lampros Smyrnaios | 01e378ea66 | |
Lampros Smyrnaios | 18cc9e0e68 | |
Lampros Smyrnaios | 2895668417 | |
Lampros Smyrnaios | 49cd0c19c2 | |
Lampros Smyrnaios | e85282d35b | |
Lampros Smyrnaios | b579296ada | |
Lampros Smyrnaios | dc97b323c9 | |
Lampros Smyrnaios | 088cf73b30 | |
Lampros Smyrnaios | 952bf7c035 | |
Lampros Smyrnaios | 33df46f6f5 | |
Lampros Smyrnaios | 9c897b8bf4 | |
Lampros Smyrnaios | 2aedae2367 | |
Lampros Smyrnaios | 4a95826f58 | |
Lampros Smyrnaios | 7f3ca80959 | |
Lampros Smyrnaios | a9b1b20a51 | |
Lampros Smyrnaios | 84f29ea7e0 | |
Lampros Smyrnaios | 0908dcab8a | |
Lampros Smyrnaios | 2b69733912 | |
Lampros Smyrnaios | f57314908a | |
Lampros Smyrnaios | 1bf27a5a4e | |
Lampros Smyrnaios | 0ca02f3587 | |
Lampros Smyrnaios | bfa569685a | |
Lampros Smyrnaios | 9fdaa9503b | |
Lampros Smyrnaios | 903032f454 | |
Lampros Smyrnaios | 9cb43b3d94 | |
Lampros Smyrnaios | 4d90846261 | |
Lampros Smyrnaios | bd0ead816d | |
Lampros Smyrnaios | 93d1aa9588 | |
Lampros Smyrnaios | cc55354e73 | |
Lampros Smyrnaios | 714938531b | |
Lampros Smyrnaios | 29a54f0b30 | |
Lampros Smyrnaios | 4eac7c5c66 | |
Lampros Smyrnaios | 0ea7bccadb | |
Lampros Smyrnaios | d5a997ad3d | |
Lampros Smyrnaios | 53ab51922a | |
Lampros Smyrnaios | fcd80a8f3f | |
Lampros Smyrnaios | 7b7dd59b57 | |
Lampros Smyrnaios | ec4d084972 | |
Lampros Smyrnaios | 0ba15dd31a | |
Lampros Smyrnaios | 344bc46e08 | |
Lampros Smyrnaios | 0997558347 | |
Lampros Smyrnaios | 796e46bc99 | |
Lampros Smyrnaios | 839a797124 | |
Lampros Smyrnaios | 4da54e7a7d | |
Lampros Smyrnaios | ec09ecc7ff | |
Lampros Smyrnaios | ba989484e4 | |
Lampros Smyrnaios | ff4fd3d289 | |
Lampros Smyrnaios | 66d3f7bcb2 | |
Lampros Smyrnaios | 81b61b530f | |
Lampros Smyrnaios | 84a37bd4b7 | |
Lampros Smyrnaios | 9888349bef | |
Lampros Smyrnaios | 0dd2b6c46f | |
Lampros Smyrnaios | 0626e85894 | |
Lampros Smyrnaios | 13f56d16c0 | |
Lampros Smyrnaios | b98ea92dec | |
Lampros Smyrnaios | 24b52fba63 | |
Lampros Smyrnaios | d6ff62d2ef | |
Lampros Smyrnaios | bd0d9eb36f | |
Lampros Smyrnaios | 7dd5719bff | |
Lampros Smyrnaios | c283cb4365 | |
Lampros Smyrnaios | d96d0c68cd | |
Lampros Smyrnaios | fd62ac567e | |
Lampros Smyrnaios | 778dc6e25c | |
Lampros Smyrnaios | 378db2ff2f | |
Lampros Smyrnaios | 8c1daadad0 | |
Lampros Smyrnaios | 6c17e86c70 | |
Lampros Smyrnaios | d37cd738a0 | |
Lampros Smyrnaios | 326af0f12d | |
Lampros Smyrnaios | 5f48f72f06 | |
Lampros Smyrnaios | 182d6153d4 | |
Lampros Smyrnaios | 01f12e2fe2 | |
Lampros Smyrnaios | 90a69686cf | |
Lampros Smyrnaios | 6450a4b8ac | |
Lampros Smyrnaios | 4b85b092fe | |
Lampros Smyrnaios | b051e10fd3 | |
Lampros Smyrnaios | 373bfa810b | |
Lampros Smyrnaios | d73a99b1c0 | |
Lampros Smyrnaios | 25070d7aba | |
Lampros Smyrnaios | 5035094e44 | |
Lampros Smyrnaios | d91732bc16 | |
Lampros Smyrnaios | 26cbb83b51 | |
Lampros Smyrnaios | d6e94912a4 | |
Lampros Smyrnaios | a1f750a0aa | |
Lampros Smyrnaios | d682298850 | |
Lampros Smyrnaios | 4976afa829 | |
Lampros Smyrnaios | 31af0a81eb | |
Lampros Smyrnaios | 5fee05e994 | |
Lampros Smyrnaios | 8453c742f2 | |
Lampros Smyrnaios | 760e0ef7e2 | |
Lampros Smyrnaios | 377b98d677 | |
Lampros Smyrnaios | edbf6461d5 | |
Lampros Smyrnaios | 0d2f0b8b01 | |
Lampros Smyrnaios | b63ad87d00 | |
Lampros Smyrnaios | 3d1faf4a8a | |
Lampros Smyrnaios | 4cadaf98fc | |
Lampros Smyrnaios | 73552ce079 |
|
@ -0,0 +1,201 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright 2021-2024 OpenAIRE AMKE
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
26
README.md
26
README.md
|
@ -1,17 +1,21 @@
|
|||
# UrlsWorker
|
||||
### [![Jenkins build status](https://jenkins-dnet.d4science.org/buildStatus/icon?job=UrlsWorker)](https://jenkins-dnet.d4science.org/job/UrlsWorker/)
|
||||
|
||||
This is the Worker's Application.<br>
|
||||
It requests assignments from the [controller](https://code-repo.d4science.org/lsmyrnaios/UrlsController) and processes them.<br>
|
||||
It posts the results to the controller, which in turn, puts them in a database.<br>
|
||||
The Worker's Application, requests assignments from the [**Controller**](https://code-repo.d4science.org/lsmyrnaios/UrlsController) and processes them with the help of the [__PublicationsRetriever__](https://github.com/LSmyrnaios/PublicationsRetriever) software and downloads the available full-texts.<br>
|
||||
Then, it posts the results to the Controller, which in turn, requests from the Worker, the full-texts which are not already found by other workers, in batches.<br>
|
||||
The Worker responds by compressing and sending the requested files, in each batch.<br>
|
||||
<br>
|
||||
Multiple instances of this app are deployed on the cloud.<br>
|
||||
We use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings very big benefits in compression rate and speed.
|
||||
<br>
|
||||
<br>
|
||||
|
||||
To install and run the application:
|
||||
**To install and run the application**:
|
||||
- Run ```git clone``` and then ```cd UrlsWorker```.
|
||||
- Create the file ```S3_minIO_credentials.txt``` , which contains just one line with the ___S3_url___, ___S3_username___, ___S3_password___, ___S3_server_region___ and the ___S3_bucket___, all separated by a _comma_ ```,```.
|
||||
- [Optional] Create the file ```inputData.txt``` , which contains just one line with the ___workerId___, the __maxAssignmentsLimitPerBatch__, the __maxAssignmentsBatchesToHandleBeforeRestart__ and the ___controller's base api-url___, all seperated by a _comma_ ```,``` . For example: ```worker_1,http://IP:PORT/api/```.
|
||||
- Execute the ```installAndRun.sh``` script. In case the above file (_inputData.txt_) does not exist, it will request the current ___worker's ID___, the __maxAssignmentsLimitPerBatch__, the __maxAssignmentsBatchesToHandleBeforeRestart__ and the ___Controller's Url___, and it will create the _inputData.txt_ file.<br>
|
||||
|
||||
Note: If the "maxAssignmentsBatchesToHandleBeforeRestart" is zero or negative, then an infinite number of assignments-batches will be handled.
|
||||
That script, installs the [PublicationsRetriever](https://github.com/LSmyrnaios/PublicationsRetriever), as a library and then compiles and runs the whole Application.<br>
|
||||
If you want to just run the app, then run the script with the argument "1": ```./installAndRun.sh 1```.<br>
|
||||
- Set the preferable values inside the [__application.properties__](https://code-repo.d4science.org/lsmyrnaios/UrlsWorker/src/branch/master/src/main/resources/application.properties) file.
|
||||
- Execute the ```installAndRun.sh``` script.<br>
|
||||
<br>
|
||||
|
||||
**Notes**:
|
||||
- If you want to just run the app, then run the script with the argument "1": ```./installAndRun.sh 1```. In this scenario, the SpringBoot-app will not be re-built.<br>
|
||||
<br>
|
||||
|
|
44
build.gradle
44
build.gradle
|
@ -1,18 +1,25 @@
|
|||
plugins {
|
||||
id 'org.springframework.boot' version '2.6.3'
|
||||
id 'io.spring.dependency-management' version '1.0.11.RELEASE'
|
||||
id 'org.springframework.boot' version '2.7.18'
|
||||
id 'io.spring.dependency-management' version '1.1.4'
|
||||
id 'java'
|
||||
}
|
||||
|
||||
group = 'eu.openaire.urls_worker'
|
||||
version = '1.0.0-SNAPSHOT'
|
||||
sourceCompatibility = '1.8'
|
||||
java {
|
||||
group = 'eu.openaire.urls_worker'
|
||||
version = '2.1.9-SNAPSHOT'
|
||||
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||
}
|
||||
|
||||
repositories {
|
||||
mavenCentral()
|
||||
flatDir {
|
||||
dirs 'libs'
|
||||
maven {
|
||||
name "d4science"
|
||||
url "https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots/"
|
||||
}
|
||||
// Enable the following, in case you want to test temporal-jars.
|
||||
/*flatDir {
|
||||
dirs 'libs'
|
||||
}*/
|
||||
}
|
||||
|
||||
dependencies {
|
||||
|
@ -24,20 +31,31 @@ dependencies {
|
|||
implementation("org.springframework.security:spring-security-core")
|
||||
implementation("org.springframework.security:spring-security-web")
|
||||
implementation("org.springframework.security:spring-security-config")
|
||||
//implementation("io.jsonwebtoken:jjwt:0.9.1") // Use this in case we use auth-tokens later on.
|
||||
|
||||
implementation "org.projectlombok:lombok:1.18.22"
|
||||
//implementation group: 'io.jsonwebtoken', name: 'jjwt-api', version: '0.11.5' // Use this in case we use auth-tokens later on.
|
||||
|
||||
// Enable the validation annotations.
|
||||
//implementation group: 'javax.validation', name: 'validation-api', version: '2.0.1.Final'
|
||||
//implementation group: 'jakarta.validation', name: 'jakarta.validation-api', version: '3.0.2'
|
||||
|
||||
implementation ("eu.openaire:publications_retriever:1.0-SNAPSHOT") {
|
||||
implementation ("eu.openaire:publications_retriever:1.2-SNAPSHOT") {
|
||||
exclude group: 'ch.qos.logback', module: 'logback-core'
|
||||
exclude group: 'ch.qos.logback', module: 'logback-classic'
|
||||
exclude group: 'org.slf4j', module: 'slf4j-api'
|
||||
exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
|
||||
}
|
||||
|
||||
testImplementation group: 'org.springframework.security', name: 'spring-security-test'
|
||||
implementation group: 'com.google.guava', name: 'guava', version: '33.1.0-jre'
|
||||
|
||||
// https://mvnrepository.com/artifact/com.google.code.gson/gson
|
||||
implementation 'com.google.code.gson:gson:2.10.1'
|
||||
|
||||
implementation("org.apache.commons:commons-compress:1.26.1") {
|
||||
exclude group: 'com.github.luben', module: 'zstd-jni'
|
||||
}
|
||||
implementation 'com.github.luben:zstd-jni:1.5.6-3' // Even though this is part of the above dependency, the Apache commons rarely updates it, while the zstd team makes improvements very often.
|
||||
// Also, for compressing, we strangely need it to be explicitly declared independently, otherwise it does not work.
|
||||
|
||||
testImplementation 'org.springframework.security:spring-security-test'
|
||||
testImplementation "org.springframework.boot:spring-boot-starter-test"
|
||||
}
|
||||
|
||||
|
@ -47,7 +65,7 @@ configurations {
|
|||
}
|
||||
|
||||
// Set increased lower and upper limits for the java-execution.
|
||||
tasks.withType(JavaExec) {
|
||||
tasks.withType(JavaExec).configureEach {
|
||||
jvmArgs = ['-Xms512m', '-Xmx8g']
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
# This script adds swap in the System.
|
||||
|
||||
# Info found in: https://www.digitalocean.com/community/tutorials/how-to-add-swap-space-on-ubuntu-20-04
|
||||
|
||||
|
||||
# Creating a Swap File with 6 Gb
|
||||
sudo fallocate -l 6G /swapfile
|
||||
ls -lh /swapfile
|
||||
|
||||
# Enabling the Swap File
|
||||
sudo chmod 600 /swapfile
|
||||
ls -lh /swapfile
|
||||
sudo mkswap /swapfile
|
||||
sudo swapon /swapfile
|
||||
sudo swapon --show
|
||||
free -h
|
||||
|
||||
# Making the Swap File Permanent
|
||||
sudo cp /etc/fstab /etc/fstab.bak
|
||||
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
|
||||
|
||||
# Adjusting the Swappiness Property
|
||||
cat /proc/sys/vm/swappiness
|
||||
sudo sysctl vm.swappiness=10 # Setting it closer to 0, makes the swaps more rare: 10%. The default is 60%.
|
||||
echo "vm.swappiness=10" | sudo tee -a /etc/sysctl.conf # Append it to that file in order for the custom "swappiness" to be available on restart.
|
|
@ -0,0 +1,4 @@
|
|||
org.gradle.caching=true
|
||||
org.gradle.parallel=true
|
||||
org.gradle.caching.debug=false
|
||||
org.gradle.warning.mode=all
|
Binary file not shown.
|
@ -1,5 +1,7 @@
|
|||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.3-bin.zip
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip
|
||||
networkTimeout=10000
|
||||
validateDistributionUrl=true
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
||||
|
|
|
@ -0,0 +1,249 @@
|
|||
#!/bin/sh
|
||||
|
||||
#
|
||||
# Copyright © 2015-2021 the original authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
##############################################################################
|
||||
#
|
||||
# Gradle start up script for POSIX generated by Gradle.
|
||||
#
|
||||
# Important for running:
|
||||
#
|
||||
# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
|
||||
# noncompliant, but you have some other compliant shell such as ksh or
|
||||
# bash, then to run this script, type that shell name before the whole
|
||||
# command line, like:
|
||||
#
|
||||
# ksh Gradle
|
||||
#
|
||||
# Busybox and similar reduced shells will NOT work, because this script
|
||||
# requires all of these POSIX shell features:
|
||||
# * functions;
|
||||
# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
|
||||
# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
|
||||
# * compound commands having a testable exit status, especially «case»;
|
||||
# * various built-in commands including «command», «set», and «ulimit».
|
||||
#
|
||||
# Important for patching:
|
||||
#
|
||||
# (2) This script targets any POSIX shell, so it avoids extensions provided
|
||||
# by Bash, Ksh, etc; in particular arrays are avoided.
|
||||
#
|
||||
# The "traditional" practice of packing multiple parameters into a
|
||||
# space-separated string is a well documented source of bugs and security
|
||||
# problems, so this is (mostly) avoided, by progressively accumulating
|
||||
# options in "$@", and eventually passing that to Java.
|
||||
#
|
||||
# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
|
||||
# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
|
||||
# see the in-line comments for details.
|
||||
#
|
||||
# There are tweaks for specific operating systems such as AIX, CygWin,
|
||||
# Darwin, MinGW, and NonStop.
|
||||
#
|
||||
# (3) This script is generated from the Groovy template
|
||||
# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
|
||||
# within the Gradle project.
|
||||
#
|
||||
# You can find Gradle at https://github.com/gradle/gradle/.
|
||||
#
|
||||
##############################################################################
|
||||
|
||||
# Attempt to set APP_HOME
|
||||
|
||||
# Resolve links: $0 may be a link
|
||||
app_path=$0
|
||||
|
||||
# Need this for daisy-chained symlinks.
|
||||
while
|
||||
APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
|
||||
[ -h "$app_path" ]
|
||||
do
|
||||
ls=$( ls -ld "$app_path" )
|
||||
link=${ls#*' -> '}
|
||||
case $link in #(
|
||||
/*) app_path=$link ;; #(
|
||||
*) app_path=$APP_HOME$link ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# This is normally unused
|
||||
# shellcheck disable=SC2034
|
||||
APP_BASE_NAME=${0##*/}
|
||||
# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036)
|
||||
APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit
|
||||
|
||||
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
||||
MAX_FD=maximum
|
||||
|
||||
warn () {
|
||||
echo "$*"
|
||||
} >&2
|
||||
|
||||
die () {
|
||||
echo
|
||||
echo "$*"
|
||||
echo
|
||||
exit 1
|
||||
} >&2
|
||||
|
||||
# OS specific support (must be 'true' or 'false').
|
||||
cygwin=false
|
||||
msys=false
|
||||
darwin=false
|
||||
nonstop=false
|
||||
case "$( uname )" in #(
|
||||
CYGWIN* ) cygwin=true ;; #(
|
||||
Darwin* ) darwin=true ;; #(
|
||||
MSYS* | MINGW* ) msys=true ;; #(
|
||||
NONSTOP* ) nonstop=true ;;
|
||||
esac
|
||||
|
||||
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
|
||||
|
||||
|
||||
# Determine the Java command to use to start the JVM.
|
||||
if [ -n "$JAVA_HOME" ] ; then
|
||||
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
|
||||
# IBM's JDK on AIX uses strange locations for the executables
|
||||
JAVACMD=$JAVA_HOME/jre/sh/java
|
||||
else
|
||||
JAVACMD=$JAVA_HOME/bin/java
|
||||
fi
|
||||
if [ ! -x "$JAVACMD" ] ; then
|
||||
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
else
|
||||
JAVACMD=java
|
||||
if ! command -v java >/dev/null 2>&1
|
||||
then
|
||||
die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
fi
|
||||
|
||||
# Increase the maximum file descriptors if we can.
|
||||
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
|
||||
case $MAX_FD in #(
|
||||
max*)
|
||||
# In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked.
|
||||
# shellcheck disable=SC2039,SC3045
|
||||
MAX_FD=$( ulimit -H -n ) ||
|
||||
warn "Could not query maximum file descriptor limit"
|
||||
esac
|
||||
case $MAX_FD in #(
|
||||
'' | soft) :;; #(
|
||||
*)
|
||||
# In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked.
|
||||
# shellcheck disable=SC2039,SC3045
|
||||
ulimit -n "$MAX_FD" ||
|
||||
warn "Could not set maximum file descriptor limit to $MAX_FD"
|
||||
esac
|
||||
fi
|
||||
|
||||
# Collect all arguments for the java command, stacking in reverse order:
|
||||
# * args from the command line
|
||||
# * the main class name
|
||||
# * -classpath
|
||||
# * -D...appname settings
|
||||
# * --module-path (only if needed)
|
||||
# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
|
||||
|
||||
# For Cygwin or MSYS, switch paths to Windows format before running java
|
||||
if "$cygwin" || "$msys" ; then
|
||||
APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
|
||||
CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
|
||||
|
||||
JAVACMD=$( cygpath --unix "$JAVACMD" )
|
||||
|
||||
# Now convert the arguments - kludge to limit ourselves to /bin/sh
|
||||
for arg do
|
||||
if
|
||||
case $arg in #(
|
||||
-*) false ;; # don't mess with options #(
|
||||
/?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
|
||||
[ -e "$t" ] ;; #(
|
||||
*) false ;;
|
||||
esac
|
||||
then
|
||||
arg=$( cygpath --path --ignore --mixed "$arg" )
|
||||
fi
|
||||
# Roll the args list around exactly as many times as the number of
|
||||
# args, so each arg winds up back in the position where it started, but
|
||||
# possibly modified.
|
||||
#
|
||||
# NB: a `for` loop captures its iteration list before it begins, so
|
||||
# changing the positional parameters here affects neither the number of
|
||||
# iterations, nor the values presented in `arg`.
|
||||
shift # remove old arg
|
||||
set -- "$@" "$arg" # push replacement arg
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
||||
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
|
||||
|
||||
# Collect all arguments for the java command:
|
||||
# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments,
|
||||
# and any embedded shellness will be escaped.
|
||||
# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be
|
||||
# treated as '${Hostname}' itself on the command line.
|
||||
|
||||
set -- \
|
||||
"-Dorg.gradle.appname=$APP_BASE_NAME" \
|
||||
-classpath "$CLASSPATH" \
|
||||
org.gradle.wrapper.GradleWrapperMain \
|
||||
"$@"
|
||||
|
||||
# Stop when "xargs" is not available.
|
||||
if ! command -v xargs >/dev/null 2>&1
|
||||
then
|
||||
die "xargs is not available"
|
||||
fi
|
||||
|
||||
# Use "xargs" to parse quoted args.
|
||||
#
|
||||
# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
|
||||
#
|
||||
# In Bash we could simply go:
|
||||
#
|
||||
# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
|
||||
# set -- "${ARGS[@]}" "$@"
|
||||
#
|
||||
# but POSIX shell has neither arrays nor command substitution, so instead we
|
||||
# post-process each arg (as a line of input to sed) to backslash-escape any
|
||||
# character that might be a shell metacharacter, then use eval to reverse
|
||||
# that process (while maintaining the separation between arguments), and wrap
|
||||
# the whole thing up as a single "set" statement.
|
||||
#
|
||||
# This will of course break if any of these variables contains a newline or
|
||||
# an unmatched quote.
|
||||
#
|
||||
|
||||
eval "set -- $(
|
||||
printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
|
||||
xargs -n1 |
|
||||
sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
|
||||
tr '\n' ' '
|
||||
)" '"$@"'
|
||||
|
||||
exec "$JAVACMD" "$@"
|
|
@ -0,0 +1,92 @@
|
|||
@rem
|
||||
@rem Copyright 2015 the original author or authors.
|
||||
@rem
|
||||
@rem Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@rem you may not use this file except in compliance with the License.
|
||||
@rem You may obtain a copy of the License at
|
||||
@rem
|
||||
@rem https://www.apache.org/licenses/LICENSE-2.0
|
||||
@rem
|
||||
@rem Unless required by applicable law or agreed to in writing, software
|
||||
@rem distributed under the License is distributed on an "AS IS" BASIS,
|
||||
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
@rem See the License for the specific language governing permissions and
|
||||
@rem limitations under the License.
|
||||
@rem
|
||||
|
||||
@if "%DEBUG%"=="" @echo off
|
||||
@rem ##########################################################################
|
||||
@rem
|
||||
@rem Gradle startup script for Windows
|
||||
@rem
|
||||
@rem ##########################################################################
|
||||
|
||||
@rem Set local scope for the variables with windows NT shell
|
||||
if "%OS%"=="Windows_NT" setlocal
|
||||
|
||||
set DIRNAME=%~dp0
|
||||
if "%DIRNAME%"=="" set DIRNAME=.
|
||||
@rem This is normally unused
|
||||
set APP_BASE_NAME=%~n0
|
||||
set APP_HOME=%DIRNAME%
|
||||
|
||||
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
|
||||
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
|
||||
|
||||
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
||||
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
|
||||
|
||||
@rem Find java.exe
|
||||
if defined JAVA_HOME goto findJavaFromJavaHome
|
||||
|
||||
set JAVA_EXE=java.exe
|
||||
%JAVA_EXE% -version >NUL 2>&1
|
||||
if %ERRORLEVEL% equ 0 goto execute
|
||||
|
||||
echo. 1>&2
|
||||
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
|
||||
echo. 1>&2
|
||||
echo Please set the JAVA_HOME variable in your environment to match the 1>&2
|
||||
echo location of your Java installation. 1>&2
|
||||
|
||||
goto fail
|
||||
|
||||
:findJavaFromJavaHome
|
||||
set JAVA_HOME=%JAVA_HOME:"=%
|
||||
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
|
||||
|
||||
if exist "%JAVA_EXE%" goto execute
|
||||
|
||||
echo. 1>&2
|
||||
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
|
||||
echo. 1>&2
|
||||
echo Please set the JAVA_HOME variable in your environment to match the 1>&2
|
||||
echo location of your Java installation. 1>&2
|
||||
|
||||
goto fail
|
||||
|
||||
:execute
|
||||
@rem Setup the command line
|
||||
|
||||
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
|
||||
|
||||
|
||||
@rem Execute Gradle
|
||||
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
|
||||
|
||||
:end
|
||||
@rem End local scope for the variables with windows NT shell
|
||||
if %ERRORLEVEL% equ 0 goto mainEnd
|
||||
|
||||
:fail
|
||||
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
|
||||
rem the _cmd.exe /c_ return code!
|
||||
set EXIT_CODE=%ERRORLEVEL%
|
||||
if %EXIT_CODE% equ 0 set EXIT_CODE=1
|
||||
if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
|
||||
exit /b %EXIT_CODE%
|
||||
|
||||
:mainEnd
|
||||
if "%OS%"=="Windows_NT" endlocal
|
||||
|
||||
:omega
|
|
@ -1,85 +1,56 @@
|
|||
cd "${0%/*}" || (echo "Could not chdir to this script's dir!" && exit) # Change the working directory to the script's directory, when running from other location.
|
||||
# This script installs and runs the project.
|
||||
# It also sets the "max-heap-size", depending on the machine's memory.
|
||||
|
||||
justInstall=0
|
||||
# For error-handling, we cannot use the "set -e" since: it has problems https://mywiki.wooledge.org/BashFAQ/105
|
||||
# So we have our own function, for use when a single command fails.
|
||||
handle_error () {
|
||||
echo -e "$1"; exit $2
|
||||
}
|
||||
|
||||
# Change the working directory to the script's directory, when running from another location.
|
||||
cd "${0%/*}" || handle_error "Could not change-dir to this script's dir!" 1
|
||||
|
||||
justRun=0
|
||||
|
||||
if [[ $# -eq 1 ]]; then
|
||||
justInstall=$1
|
||||
justRun=$1
|
||||
elif [[ $# -gt 1 ]]; then
|
||||
echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: script.sh <justInstall: 0 | 1>"; exit 1
|
||||
echo -e "Wrong number of arguments given: ${#} (more than 1)\nPlease execute it like: script.sh <justRun: 0 | 1>"; exit 2
|
||||
fi
|
||||
|
||||
# Check of the "inputData.txt" file exist, if not, ask to fill it.
|
||||
inputDataFile="inputData.txt"
|
||||
gradleVersion="8.7"
|
||||
|
||||
if [[ ! -f $inputDataFile ]]; then
|
||||
echo -e "The file \"$inputDataFile\" does not exist. Going to create it..\n"
|
||||
shouldBeCarefulWithMaxHeap=0 # This is NOT a cmd-arg.
|
||||
|
||||
echo "Give the ID of this worker:"
|
||||
read -r workerId
|
||||
|
||||
echo -e "\nGive the max-assignments-limit-per-batch for the Worker to handle: "
|
||||
read -r maxAssignmentsLimitPerBatch
|
||||
|
||||
echo -e "\nGive the max-assignments-batches to handle before restart: "
|
||||
read -r maxAssignmentsBatchesToHandleBeforeRestart
|
||||
|
||||
echo -e "\nGive the baseUrl of the controller (e.g.: http://IP:PORT/api/):"
|
||||
read -r controllerBaseUrl
|
||||
|
||||
touch $inputDataFile
|
||||
echo "$workerId,$maxAssignmentsLimitPerBatch,$maxAssignmentsBatchesToHandleBeforeRestart,$controllerBaseUrl" >> $inputDataFile
|
||||
echo -e "\n\n"
|
||||
fi
|
||||
|
||||
gradleVersion="7.3.3"
|
||||
|
||||
if [[ justInstall -eq 0 ]]; then
|
||||
|
||||
if [ ! -d libs ]; then
|
||||
mkdir libs || (echo -e "The directory \"libs\" could not be created! Exiting.." && exit 2)
|
||||
fi
|
||||
|
||||
cd libs || exit 3
|
||||
git clone https://github.com/LSmyrnaios/PublicationsRetriever.git # We assume there is no previously source-code here, if so, it will be overwritten.
|
||||
|
||||
# Keep a backup of the existing JAR file.
|
||||
mv ./publications_retriever-1.0-SNAPSHOT.jar ./publications_retriever-1.0-SNAPSHOT_BACKUP.jar
|
||||
|
||||
cd PublicationsRetriever && sudo apt install -y maven && mvn clean install
|
||||
|
||||
# Copy the created JAR file to the top libs directory.
|
||||
cp target/publications_retriever-1.0-SNAPSHOT.jar ../publications_retriever-1.0-SNAPSHOT.jar
|
||||
|
||||
# Delete the directory with the source-code.
|
||||
cd ../ && rm -rf PublicationsRetriever
|
||||
|
||||
# Clean, (re)build and run the project.
|
||||
cd ../
|
||||
if [[ justRun -eq 0 ]]; then
|
||||
|
||||
# Install the specified Gradle-build-tool version, if it does not exist.
|
||||
if [[ ! -d /opt/gradle/gradle-${gradleVersion} ]]; then
|
||||
wget https://services.gradle.org/distributions/gradle-${gradleVersion}-bin.zip
|
||||
echo -e "\nAsking for sudo, in order to install 'gradle'..\n"
|
||||
sudo mkdir /opt/gradle
|
||||
sudo apt install -y unzip && sudo unzip -d /opt/gradle gradle-${gradleVersion}-bin.zip
|
||||
sudo rm -rf gradle-${gradleVersion}-bin.zip
|
||||
#ls /opt/gradle/gradle-${gradleVersion} # For debugging installation
|
||||
fi
|
||||
|
||||
export PATH=/opt/gradle/gradle-${gradleVersion}/bin:$PATH
|
||||
|
||||
# Update the max-heap-size based on the machine's physical memory.
|
||||
machine_memory_mb=$(grep MemTotal /proc/meminfo | awk '{print $2}' | xargs -I {} echo "scale=4; {}/1024" | bc) # It returns the size in MB.
|
||||
max_heap_size_mb=$(echo "($machine_memory_mb - 896)/1" | bc) # Leave 896 MB to the system (the "()/1" is used to take the floor value).
|
||||
# Now, we replace the "-Xmx" parameter inside the "./build.gradle" file, with "-Xmx${max_heap_size}m"
|
||||
echo -e "\n\nThe max-heap-size (-Xmx) will be set to: ${max_heap_size_mb}m\n\n"
|
||||
sed -i "s/'-Xmx[0-9]\+[gm]'/'-Xmx${max_heap_size_mb}m'/g" ./build.gradle
|
||||
if [[ shouldBeCarefulWithMaxHeap -eq 1 ]]; then
|
||||
# Update the max-heap-size based on the machine's physical memory.
|
||||
machine_memory_mb=$(grep MemTotal /proc/meminfo | awk '{print $2}' | xargs -I {} echo "scale=4; {}/1024" | bc) # It returns the size in MB.
|
||||
max_heap_size_mb=$(echo "($machine_memory_mb - 800)/1" | bc) # Leave 800 MB to the system (the "()/1" is used to take the floor value).
|
||||
# Now, we replace the "-Xmx" parameter inside the "./build.gradle" file, with "-Xmx${max_heap_size}m"
|
||||
echo -e "\n\nThe max-heap-size (-Xmx) will be set to: ${max_heap_size_mb}m\n\n"
|
||||
sed -i "s/'-Xmx[0-9]\+[gm]'/'-Xmx${max_heap_size_mb}m'/g" ./build.gradle
|
||||
fi
|
||||
|
||||
gradle wrapper --gradle-version=${gradleVersion} --distribution-type=bin
|
||||
|
||||
#gradle tasks # For debugging installation
|
||||
#gradle -v # For debugging installation
|
||||
|
||||
gradle clean
|
||||
gradle build
|
||||
gradle clean build # --refresh-dependencies # --info
|
||||
else
|
||||
export PATH=/opt/gradle/gradle-${gradleVersion}/bin:$PATH # Make sure the gradle is still accessible (it usually isn't without the "export").
|
||||
fi
|
||||
|
|
|
@ -1,18 +1,16 @@
|
|||
package eu.openaire.urls_worker;
|
||||
|
||||
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||||
import eu.openaire.publications_retriever.util.file.FileUtils;
|
||||
import eu.openaire.urls_worker.components.ScheduledTasks;
|
||||
import eu.openaire.urls_worker.plugins.PublicationsRetrieverPlugin;
|
||||
import eu.openaire.urls_worker.util.AssignmentsHandler;
|
||||
import eu.openaire.urls_worker.controllers.FullTextsController;
|
||||
import eu.openaire.urls_worker.util.UriBuilder;
|
||||
import eu.openaire.urls_worker.util.WorkerConstants;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.CommandLineRunner;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.boot.web.servlet.context.ServletWebServerApplicationContext;
|
||||
import org.springframework.context.ConfigurableApplicationContext;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.core.env.Environment;
|
||||
|
@ -22,12 +20,8 @@ import org.springframework.web.cors.CorsConfigurationSource;
|
|||
import org.springframework.web.cors.UrlBasedCorsConfigurationSource;
|
||||
|
||||
import javax.annotation.PreDestroy;
|
||||
import java.io.File;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Scanner;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
|
||||
|
@ -38,33 +32,38 @@ public class UrlsWorkerApplication {
|
|||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UrlsWorkerApplication.class);
|
||||
|
||||
private static final String inputDataFilePath = FileUtils.workingDir + "inputData.txt";
|
||||
public static String workerId = null;
|
||||
public static int maxAssignmentsLimitPerBatch = 0;
|
||||
public static int maxAssignmentsBatchesToHandleBeforeRestart = -1; // Default value: -1 = argument-absent, 0 = infinite-batches
|
||||
public static String controllerBaseUrl = null; // BaseUrl template: "http://IP:PORT/api/"
|
||||
private static String workerId;
|
||||
|
||||
private static ConfigurableApplicationContext context;
|
||||
|
||||
|
||||
public UrlsWorkerApplication(@Value("${info.workerId}") String workerId)
|
||||
{
|
||||
UrlsWorkerApplication.workerId = workerId;
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
setInputData(); // This may cause the Server to terminate early, in case the workerId or the controllerBaseUrl cannot be found.
|
||||
new PublicationsRetrieverPlugin();
|
||||
new AssignmentsHandler();
|
||||
|
||||
context = SpringApplication.run(UrlsWorkerApplication.class, args);
|
||||
|
||||
Runtime javaRuntime = Runtime.getRuntime();
|
||||
logger.debug("HeapSize: " + javaRuntime.totalMemory());
|
||||
logger.debug("HeapMaxSize: " + javaRuntime.maxMemory());
|
||||
logger.debug("HeapFreeSize: " + javaRuntime.freeMemory());
|
||||
int mb = 1048576;
|
||||
logger.debug("HeapSize: " + (javaRuntime.totalMemory() / mb) + " mb");
|
||||
logger.debug("HeapMaxSize: " + (javaRuntime.maxMemory() / mb) + " mb");
|
||||
logger.debug("HeapFreeSize: " + (javaRuntime.freeMemory() / mb) + " mb");
|
||||
|
||||
logger.info("The Worker has started running. Its id is: \"" + workerId + "\".");
|
||||
}
|
||||
|
||||
|
||||
public static void gentleAppShutdown()
|
||||
{
|
||||
int exitCode = SpringApplication.exit(context, () -> 0); // The "PreDestroy" method will be called.
|
||||
int exitCode = 0;
|
||||
try {
|
||||
exitCode = SpringApplication.exit(context, () -> 0); // The "PreDestroy" method will be called. (the "context" will be closed automatically (I checked it))
|
||||
} catch (IllegalArgumentException iae) {
|
||||
logger.error(iae.getMessage()); // This will say "Context must not be null", in case the "gentleAppShutdown()" was called too early in the app's lifetime. But it's ok.
|
||||
}
|
||||
System.exit(exitCode);
|
||||
}
|
||||
|
||||
|
@ -77,7 +76,7 @@ public class UrlsWorkerApplication {
|
|||
logger.info("Shutting down the threads used by \"PublicationsRetriever\"-plugin..");
|
||||
PublicationsRetriever.executor.shutdown(); // Define that no new tasks will be scheduled.
|
||||
try {
|
||||
if ( !PublicationsRetriever.executor.awaitTermination(1, TimeUnit.MINUTES) ) {
|
||||
if ( !PublicationsRetriever.executor.awaitTermination(2, TimeUnit.MINUTES) ) {
|
||||
logger.warn("The working threads did not finish on time! Stopping them immediately..");
|
||||
PublicationsRetriever.executor.shutdownNow();
|
||||
}
|
||||
|
@ -92,9 +91,11 @@ public class UrlsWorkerApplication {
|
|||
}
|
||||
}
|
||||
|
||||
ScheduledTasks.deleteHandledAssignmentsFullTexts();
|
||||
FullTextsController.deleteAssignmentsDirectory(-1, null);
|
||||
logger.info("Exiting..");
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public CorsConfigurationSource corsConfigurationSource() {
|
||||
CorsConfiguration configuration = new CorsConfiguration();
|
||||
|
@ -107,80 +108,11 @@ public class UrlsWorkerApplication {
|
|||
return source;
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public CommandLineRunner setServerBaseUrl(Environment environment)
|
||||
public CommandLineRunner setServerBaseUrl(Environment environment, ServletWebServerApplicationContext webServerAppCtxt)
|
||||
{
|
||||
return args -> new UriBuilder(environment);
|
||||
}
|
||||
|
||||
|
||||
private static void setInputData()
|
||||
{
|
||||
// Take the workerId and the controllerBaseUrl from the file.
|
||||
Scanner myReader = null;
|
||||
try {
|
||||
File inputDataFile = new File(inputDataFilePath);
|
||||
if ( !inputDataFile.exists() ) {
|
||||
String errorMsg = "controllerBaseUrlFile \"" + inputDataFilePath + "\" does not exists!";
|
||||
logger.error(errorMsg);
|
||||
System.err.println(errorMsg);
|
||||
System.exit(60);
|
||||
}
|
||||
myReader = new Scanner(inputDataFile);
|
||||
if ( myReader.hasNextLine() ) {
|
||||
String[] data = myReader.nextLine().split(",");
|
||||
if ( data.length < 4 ) {
|
||||
String errorMsg = "Not all data were retrieved from file \"" + inputDataFilePath + "\"!";
|
||||
logger.error(errorMsg);
|
||||
System.err.println(errorMsg);
|
||||
System.exit(61);
|
||||
}
|
||||
workerId = data[0].trim();
|
||||
String maxAssignmentsLimitStr = data[1].trim();
|
||||
try {
|
||||
maxAssignmentsLimitPerBatch = Integer.parseInt(maxAssignmentsLimitStr);
|
||||
} catch (NumberFormatException nfe) {
|
||||
logger.warn("The given \"maxAssignmentsLimitPerBatch\" (" + maxAssignmentsLimitStr + ") was not a number! Will use the default one: " + WorkerConstants.ASSIGNMENTS_LIMIT);
|
||||
maxAssignmentsLimitPerBatch = WorkerConstants.ASSIGNMENTS_LIMIT;
|
||||
}
|
||||
String maxAssignmentsBatchesStr = data[2].trim();
|
||||
try {
|
||||
maxAssignmentsBatchesToHandleBeforeRestart = Integer.parseInt(maxAssignmentsBatchesStr);
|
||||
} catch (NumberFormatException nfe) {
|
||||
logger.warn("The given \"maxAssignmentsBatchesToHandleBeforeRestart\" (" + maxAssignmentsBatchesStr + ") was not a number! Will handle an infinite number of batches!");
|
||||
maxAssignmentsBatchesToHandleBeforeRestart = 0;
|
||||
}
|
||||
controllerBaseUrl = data[3].trim();
|
||||
try {
|
||||
new URL(controllerBaseUrl);
|
||||
} catch (MalformedURLException mue) {
|
||||
String errorMsg = "The given \"controllerBaseUrl\" (\"" + controllerBaseUrl + "\") was malformed! Please restart the program and give a valid URL.";
|
||||
logger.error(errorMsg);
|
||||
System.err.println(errorMsg);
|
||||
System.exit(62);
|
||||
}
|
||||
if ( !controllerBaseUrl.endsWith("/") )
|
||||
controllerBaseUrl += "/"; // Make sure the other urls will not break later.
|
||||
}
|
||||
|
||||
if ( (workerId == null) || (maxAssignmentsLimitPerBatch == 0) || (maxAssignmentsBatchesToHandleBeforeRestart == -1) || (controllerBaseUrl == null) ) {
|
||||
String errorMsg = "No \"workerId\" or/and \"maxAssignmentsLimitPerBatch\" or/and \"maxAssignmentsBatchesToHandleBeforeRestart\" or/and \"controllerBaseUrl\" could be retrieved from the file: " + inputDataFilePath;
|
||||
logger.error(errorMsg);
|
||||
System.err.println(errorMsg);
|
||||
System.exit(63);
|
||||
}
|
||||
|
||||
logger.info("workerId: " + workerId + ", maxAssignmentsLimitPerBatch: " + maxAssignmentsLimitPerBatch + ", maxAssignmentsBatchesToHandleBeforeRestart: " + maxAssignmentsBatchesToHandleBeforeRestart + ", controllerBaseUrl: " + controllerBaseUrl); // It's safe and helpful to show them in the logs.
|
||||
|
||||
} catch (Exception e) {
|
||||
String errorMsg = "An error prevented the retrieval of the workerId and the controllerBaseUrl from the file: " + inputDataFilePath + "\n" + e.getMessage();
|
||||
logger.error(errorMsg, e);
|
||||
System.err.println(errorMsg);
|
||||
System.exit(64);
|
||||
} finally {
|
||||
if ( myReader != null )
|
||||
myReader.close();
|
||||
}
|
||||
return args -> new UriBuilder(environment, webServerAppCtxt);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,398 @@
|
|||
package eu.openaire.urls_worker.components;
|
||||
|
||||
import com.google.common.collect.HashMultimap;
|
||||
import com.google.common.collect.Multimap;
|
||||
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||||
import eu.openaire.publications_retriever.util.url.GenericUtils;
|
||||
import eu.openaire.publications_retriever.util.url.UrlUtils;
|
||||
import eu.openaire.urls_worker.UrlsWorkerApplication;
|
||||
import eu.openaire.urls_worker.components.plugins.PublicationsRetrieverPlugin;
|
||||
import eu.openaire.urls_worker.controllers.GeneralController;
|
||||
import eu.openaire.urls_worker.models.Assignment;
|
||||
import eu.openaire.urls_worker.models.UrlReport;
|
||||
import eu.openaire.urls_worker.payloads.requests.AssignmentsRequest;
|
||||
import eu.openaire.urls_worker.payloads.responces.WorkerReport;
|
||||
import eu.openaire.urls_worker.util.FilesCompressor;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.web.client.RestTemplateBuilder;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.web.client.HttpServerErrorException;
|
||||
import org.springframework.web.client.RestClientException;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
|
||||
@Component
|
||||
public class AssignmentsHandler {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(AssignmentsHandler.class);
|
||||
|
||||
@Autowired
|
||||
PublicationsRetrieverPlugin publicationsRetrieverPlugin;
|
||||
|
||||
private final String workerId;
|
||||
private final String controllerBaseUrl;
|
||||
private final int maxAssignmentsLimitPerBatch;
|
||||
public final int maxAssignmentsBatchesToHandleBeforeShutdown;
|
||||
|
||||
public static List<UrlReport> urlReports = null;
|
||||
private static final int expectedDatasourcesPerRequest = 1400; // Per 10_000 assignments.
|
||||
public static Multimap<String, Assignment> assignmentsForPlugins = null;
|
||||
private static final boolean askForTest = false; // Enable this only for testing.
|
||||
private static String requestUrl;
|
||||
|
||||
public static final RestTemplate restTemplate = new RestTemplateBuilder().setConnectTimeout(Duration.ofMinutes(2)).setReadTimeout(Duration.ofHours(1)).build();
|
||||
|
||||
public static boolean hadConnectionErrorOnRequest = false;
|
||||
|
||||
public static long numHandledAssignmentsBatches = 0; // No need to be synchronized.
|
||||
|
||||
public static final long idUrlsToHandleBeforeClearingDomainAndPathBlockingData = 300_000;
|
||||
public static long timesClearingDomainAndPathBlockingData = 0;
|
||||
|
||||
public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 600_000;
|
||||
public static long timesClearingDomainAndPathTrackingData = 0;
|
||||
|
||||
public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 200_000;
|
||||
public static long timesClearingDuplicateUrlsData = 0;
|
||||
|
||||
public String workerReportsDirPath;
|
||||
|
||||
|
||||
public AssignmentsHandler(@Value("${info.workerId}") String workerId, @Value("${info.maxAssignmentsLimitPerBatch}") int maxAssignmentsLimitPerBatch,
|
||||
@Value("${info.maxAssignmentsBatchesToHandleBeforeShutdown}") int maxAssignmentsBatchesToHandleBeforeShutdown,
|
||||
@Value("${info.controllerBaseUrl}") String controllerBaseUrl,
|
||||
@Value("${workerReportsDirPath}") String workerReportsDirPath)
|
||||
{
|
||||
this.workerId = workerId;
|
||||
this.maxAssignmentsLimitPerBatch = maxAssignmentsLimitPerBatch;
|
||||
this.maxAssignmentsBatchesToHandleBeforeShutdown = maxAssignmentsBatchesToHandleBeforeShutdown;
|
||||
this.controllerBaseUrl = controllerBaseUrl;
|
||||
urlReports = new ArrayList<>(this.maxAssignmentsLimitPerBatch);
|
||||
int expectedAssignmentsPerDatasource = (this.maxAssignmentsLimitPerBatch / expectedDatasourcesPerRequest);
|
||||
assignmentsForPlugins = HashMultimap.create(expectedDatasourcesPerRequest, expectedAssignmentsPerDatasource);
|
||||
requestUrl = this.controllerBaseUrl + (askForTest ? "test/" : "") + "urls?workerId=" + this.workerId + "&workerAssignmentsLimit=" + this.maxAssignmentsLimitPerBatch;
|
||||
|
||||
if ( !workerReportsDirPath.endsWith("/") )
|
||||
workerReportsDirPath += "/";
|
||||
|
||||
this.workerReportsDirPath = workerReportsDirPath;
|
||||
try {
|
||||
Files.createDirectories(Paths.get(this.workerReportsDirPath)); // No-op if it already exists.
|
||||
} catch (Exception e) {
|
||||
String errorMsg = "Could not create the \"workerReportsDirPath\": " + this.workerReportsDirPath;
|
||||
logger.error(errorMsg, e);
|
||||
throw new RuntimeException(errorMsg);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public AssignmentsRequest requestAssignments()
|
||||
{
|
||||
logger.info("Going to request up to " + this.maxAssignmentsLimitPerBatch + " assignments from the Controller: " + requestUrl);
|
||||
AssignmentsRequest assignmentRequest = null;
|
||||
try { // Here, the HTTP-request is executed.
|
||||
assignmentRequest = restTemplate.getForObject(requestUrl, AssignmentsRequest.class);
|
||||
} catch (RestClientException rce) {
|
||||
logger.error("Could not retrieve the assignments! " + rce.getMessage()); // The exMsg also shows the response body of the response (from Spring v.2.5.6 onwards).
|
||||
hadConnectionErrorOnRequest = true;
|
||||
return null;
|
||||
} catch (IllegalArgumentException iae) {
|
||||
logger.error("Could not retrieve the assignments, as the provided Controller's url was malformed! " + iae.getMessage());
|
||||
// We do not need to send a "ShutdownReport" to the Controller, since this error will appear upon the Worker's initialization and the Controller will not have any information about this Worker's existence.
|
||||
UrlsWorkerApplication.gentleAppShutdown();
|
||||
}
|
||||
//logger.debug(assignmentRequest.toString()); // DEBUG!
|
||||
return assignmentRequest;
|
||||
}
|
||||
|
||||
|
||||
public static boolean shouldNotRequestMore = false;
|
||||
|
||||
|
||||
public void handleAssignments()
|
||||
{
|
||||
AssignmentsRequest assignmentsRequest = requestAssignments();
|
||||
if ( assignmentsRequest == null )
|
||||
return;
|
||||
|
||||
Long assignmentRequestCounter = assignmentsRequest.getAssignmentsCounter();
|
||||
List<Assignment> assignments = assignmentsRequest.getAssignments();
|
||||
if ( assignments == null ) {
|
||||
if ( assignmentRequestCounter == -1 )
|
||||
logger.warn("The Controller could not retrieve and assignments from the database. It will increase the attempts-number and retry in the next request.");
|
||||
else
|
||||
logger.warn("The assignments were found to be null for assignmentRequestCounter = " + assignmentRequestCounter);
|
||||
return; // The Worker will just request the assignments again, immediately.
|
||||
}
|
||||
|
||||
int assignmentsSize = assignments.size();
|
||||
if ( assignmentsSize == 0 ) {
|
||||
logger.warn("The assignmentsSize was < 0 > for assignmentRequestCounter = " + assignmentRequestCounter);
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("AssignmentRequest < " + assignmentRequestCounter + " > was received and it's ready to be processed. It contains " + assignmentsSize + " assignments.");
|
||||
|
||||
Instant startTime = Instant.now();
|
||||
|
||||
// Make sure there are no multiple occurrences of urls with the same domains are present, next to each other, inside the list.
|
||||
// If the same domains appear too close in the list, then this means we have large waiting-times between url-connections, due to "politeness-delays" to avoid server-overloading.
|
||||
|
||||
assignments = getAssignmentsSpacedOutByDomain(assignments, assignmentsSize, false);
|
||||
|
||||
// Iterate over the assignments and add each assignment in its own list depending on the DATASOURCE in order to decide which plugin to use later.
|
||||
for ( Assignment assignment : assignments ) {
|
||||
try {
|
||||
assignmentsForPlugins.put(assignment.getDatasource().getId(), assignment);
|
||||
} catch (NullPointerException npe) {
|
||||
logger.warn("An NPE was thrown when splitting the assignments based on the datasource-types. The problematic assignment was: " + assignment); // Do not use "assignment.toString()", it may cause an NPE.
|
||||
}
|
||||
}
|
||||
|
||||
//countDatasourcesAndRecords(assignmentsSize); // Only for DEBUG! Keep it commented in normal run.
|
||||
|
||||
// TODO - Decide which assignments should run with what plugin (depending on their datasource).
|
||||
// First run -in parallel- the assignments which require some specific plugin.
|
||||
// Then, after the above plugins are finished, run the remaining assignments in the generic plugin (which handles parallelism itself).
|
||||
// TODO - If we have more than one plugin running at the same time, then make the "AssignmentsHandler.urlReports"-list thread-safe.
|
||||
|
||||
// For now, let's just run all assignments in the generic plugin.
|
||||
try {
|
||||
publicationsRetrieverPlugin.processAssignments(assignmentRequestCounter, assignmentsForPlugins.values());
|
||||
} catch (Exception e) {
|
||||
logger.error("Exception when processing the assignments_" + assignmentRequestCounter, e);
|
||||
return;
|
||||
} // In this case, no assignments were processed.
|
||||
|
||||
PublicationsRetriever.calculateAndPrintElapsedTime(startTime, Instant.now(), "The processing of assignments_" + assignmentRequestCounter + " (containing " + assignmentsSize + " assignments) finished after: ");
|
||||
|
||||
if ( askForTest ) {
|
||||
logger.debug("UrlReports:"); // DEBUG!
|
||||
for ( UrlReport urlReport : urlReports )
|
||||
logger.debug(urlReport.toString());
|
||||
} // Avoid posting the results in "askForTestUrls"-mode. We don't want for test-results to be written into the database by the controller.
|
||||
else
|
||||
postWorkerReport(assignmentRequestCounter);
|
||||
|
||||
// The "postWorkerReport()" above, may fail, but the numbers below still stand, as they are affected by the results themselves, rather than the "posting" of them to the Controller.
|
||||
|
||||
numHandledAssignmentsBatches ++; // This is used later to stop this app, if a user-defined upper limit is set and reached.
|
||||
|
||||
// Every time we reach a "limit" of handled id-url clear some data-structures of the underlying "PublicationsRetriever" program.
|
||||
// This helps with reducing the memory consumption over the period of weeks or months, and also give a 2nd chance to some domains which may be blocked due to a connectivity issues, but after a month they may be fine.
|
||||
long idUrlPairsHandled = (numHandledAssignmentsBatches * maxAssignmentsLimitPerBatch);
|
||||
|
||||
if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData +1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) {
|
||||
UrlUtils.duplicateUrls.clear();
|
||||
timesClearingDuplicateUrlsData ++;
|
||||
}
|
||||
|
||||
if ( idUrlPairsHandled >= ((timesClearingDomainAndPathTrackingData +1) * idUrlsToHandleBeforeClearingDomainAndPathTrackingData) ) {
|
||||
GenericUtils.clearTrackingData(); // This includes the "blocking data", we may say "if this condition is true, do not bother checking the just-blocking condition".
|
||||
timesClearingDomainAndPathTrackingData ++;
|
||||
timesClearingDomainAndPathBlockingData ++; // Increment this also, as we avoid the following check in this case, but the counter has to be increased nevertheless.
|
||||
} else if ( idUrlPairsHandled >= ((timesClearingDomainAndPathBlockingData +1) * idUrlsToHandleBeforeClearingDomainAndPathBlockingData) ) {
|
||||
GenericUtils.clearBlockingData();
|
||||
timesClearingDomainAndPathBlockingData ++;
|
||||
}
|
||||
|
||||
if ( GeneralController.shouldShutdownWorker
|
||||
|| (numHandledAssignmentsBatches == maxAssignmentsBatchesToHandleBeforeShutdown) )
|
||||
{
|
||||
logger.info("The worker will shutdown, after the full-texts are delivered to the Controller, as " + (GeneralController.shouldShutdownWorker
|
||||
? "it received a \"shutdownWorker\" request!"
|
||||
: "the maximum assignments-batches (" + maxAssignmentsBatchesToHandleBeforeShutdown + ") to be handled was reached!"));
|
||||
|
||||
// Here, just specify that we do not want to request for more assignments. A scheduling job will check if the fulltexts were delivered to the Controller and then shutdown the Worker.
|
||||
shouldNotRequestMore = true;
|
||||
}
|
||||
|
||||
// Note: Cannot call this method, here, retrospectively, as if it runs 100s of times, the memory-stack may break..
|
||||
// The scheduler will handle calling it repetitively, in case the Worker is available for work..
|
||||
}
|
||||
|
||||
|
||||
public static final Set<Long> handledAssignmentsCounters = Collections.newSetFromMap(new ConcurrentHashMap<Long, Boolean>());
|
||||
|
||||
|
||||
/**
|
||||
* Post the worker report and wait for the Controller to request the publication-files.
|
||||
* Once the Controller finishes with uploading the files to the S3-ObjectStore, it returns an "HTTP-200-OK" response to the Worker.
|
||||
* Afterwards, the Worker, even in case of an error, deletes the full-texts and the ".tar" and ".tar.zstd" files.
|
||||
* */
|
||||
public boolean postWorkerReport(Long assignmentRequestCounter)
|
||||
{
|
||||
String postUrl = this.controllerBaseUrl + "urls/addWorkerReport";
|
||||
logger.info("Going to post the WorkerReport of assignments_" + assignmentRequestCounter + " to the controller-server: " + postUrl);
|
||||
WorkerReport workerReport = new WorkerReport(this.workerId, assignmentRequestCounter, urlReports);
|
||||
|
||||
// Create the report file. It may be useful later, in case something goes wrong when sending the report to the Controller or the Controller cannot process it.
|
||||
// The report-file is deleted, along with the full-texts) when the Controller posts that the processing of this report was successful.
|
||||
writeToFile(this.workerReportsDirPath + this.workerId + "_assignments_" + assignmentRequestCounter + "_report.json", workerReport.getJsonReport(), false);
|
||||
|
||||
// The worker sends this "WorkerReport" to the Controller, which after some checks, it adds a job to a background thread and responds to the Worker with HTTP-200-OK.
|
||||
try {
|
||||
ResponseEntity<String> responseEntity = restTemplate.postForEntity(postUrl, workerReport, String.class);
|
||||
int responseCode = responseEntity.getStatusCodeValue();
|
||||
if ( responseCode == HttpStatus.OK.value() ) {
|
||||
logger.info("The submission of the WorkerReport of assignments_" + assignmentRequestCounter + " to the Controller, was successful.");
|
||||
handledAssignmentsCounters.add(assignmentRequestCounter);
|
||||
return true;
|
||||
} else { // This does not include HTTP-5XX errors. For them an "HttpServerErrorException" is thrown.
|
||||
logger.error("HTTP-Connection problem with the submission of the WorkerReport of assignments_" + assignmentRequestCounter + " to the Controller! Error-code was: " + responseCode);
|
||||
return false;
|
||||
}
|
||||
} catch (HttpServerErrorException hsee) {
|
||||
logger.error("The Controller failed to handle the WorkerReport of assignments_" + assignmentRequestCounter + ": " + hsee.getMessage());
|
||||
return false;
|
||||
} catch (Exception e) {
|
||||
logger.error("Error when submitting the WorkerReport of assignments_" + assignmentRequestCounter + " to the Controller: ", e);
|
||||
return false;
|
||||
} finally {
|
||||
urlReports.clear(); // Reset, without de-allocating.
|
||||
assignmentsForPlugins.clear();
|
||||
// The full-text files will be deleted after being transferred to the Controller.
|
||||
}
|
||||
|
||||
// Note: It is possible that one or more full-texts-batches, are not sent to the Controller, or that the Controller failed to process them.
|
||||
// In that case, the related "attempt"-records will keep their "success" state, but the related "payload" records will not be inserted into the database.
|
||||
// When all the id-urls are processed at least one time, the Service will start reprocessing all the "couldRetry" records without a related "payload"-record.
|
||||
}
|
||||
|
||||
|
||||
public static List<Assignment> getAssignmentsSpacedOutByDomain(List<Assignment> assignments, int assignmentsSize, boolean shouldPrintDifference)
|
||||
{
|
||||
List<Assignment> spacedOutAssignments = new ArrayList<>(assignmentsSize);
|
||||
|
||||
// Check the order of urls' domain in the list. Same domain-urls should be far away from each other, to improve parallelism. (this should happen after the plugin-categorization)
|
||||
HashMultimap<String, Assignment> domainsWithAssignments = HashMultimap.create(assignmentsSize/3, 3);
|
||||
|
||||
StringBuilder sb = null;
|
||||
if ( shouldPrintDifference )
|
||||
sb = new StringBuilder(assignmentsSize * 20);
|
||||
|
||||
for ( Assignment assignment : assignments ) {
|
||||
if ( assignment != null ) {
|
||||
String url = assignment.getOriginalUrl();
|
||||
if ( url != null ) {
|
||||
String domain = UrlUtils.getDomainStr(url, null);
|
||||
if ( domain != null ) {
|
||||
domain = UrlUtils.getTopThreeLevelDomain(domain); // This does not return null, only the param itself, in case of an error.
|
||||
domainsWithAssignments.put(domain, assignment); // Each "domain" will have multiple assignments.
|
||||
if ( sb != null )
|
||||
sb.append(domain).append("\n"); // DEBUG!
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( sb != null ) {
|
||||
logger.debug("Before change:\n" + sb); // DEBUG!
|
||||
sb.setLength(0); // Reset it without re-sizing it.
|
||||
}
|
||||
|
||||
List<String> domains = new ArrayList<>(domainsWithAssignments.keySet());
|
||||
int domainsSize = domains.size();
|
||||
Integer domainsCounter = -1;
|
||||
|
||||
for ( int i = 0; i < assignmentsSize; ++i )
|
||||
{
|
||||
HashMap<Object, Integer> result = getFirstAvailableObjectForSpacedOutDomains(domains, domainsCounter, domainsWithAssignments, domainsSize, sb);
|
||||
if ( result == null ) { // Check whether the recursive method was left without data.
|
||||
logger.warn("the recursive method was asked to do more, using less data!");
|
||||
break;
|
||||
}
|
||||
Assignment nextAssignment = (Assignment) result.keySet().toArray()[0];
|
||||
domainsCounter = result.get(nextAssignment);
|
||||
spacedOutAssignments.add(nextAssignment);
|
||||
}
|
||||
|
||||
if ( sb != null )
|
||||
logger.debug("After change:\n" + sb);
|
||||
|
||||
return spacedOutAssignments;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method uses recursion to go through the "domainsWithAssignments" multimap and get the nextAssignment.
|
||||
* The recursion terminates when there is no more data for any domain.
|
||||
* This method may return null, in case it is called more time than the number of assignments all the domains hold inside "domainsWithAssignments".
|
||||
* */
|
||||
public static HashMap<Object, Integer> getFirstAvailableObjectForSpacedOutDomains(List<String> domainsList, Integer domainsCounter, HashMultimap<String, ?> domainsWithAssignments, int domainsSize, StringBuilder sb)
|
||||
{
|
||||
// Normally, this method does not need a recursion-break-safety, as the initial-caller method should call this method exactly N times, where N is the number of all the values of "domainsWithAssignments".
|
||||
// Although, for extra-safety and re-usability, let's have this check here.
|
||||
if ( domainsWithAssignments.keySet().isEmpty() )
|
||||
return null; // Break recursion when the domains run-out.
|
||||
|
||||
if ( domainsCounter < (domainsSize -1) )
|
||||
domainsCounter ++;
|
||||
else
|
||||
domainsCounter = 0; // Start over.
|
||||
|
||||
String currentDomain = domainsList.get(domainsCounter);
|
||||
Set<?> assignmentsOfCurrentDomain = domainsWithAssignments.get(currentDomain);
|
||||
if ( assignmentsOfCurrentDomain.isEmpty() ) // This domain is out of assignments, check the next available one.
|
||||
return getFirstAvailableObjectForSpacedOutDomains(domainsList, domainsCounter, domainsWithAssignments, domainsSize, sb);
|
||||
|
||||
Object nextAssignment = assignmentsOfCurrentDomain.toArray()[0];
|
||||
HashMap<Object, Integer> result = new HashMap<>();
|
||||
result.put(nextAssignment, domainsCounter);
|
||||
domainsWithAssignments.remove(currentDomain, nextAssignment);
|
||||
if ( sb != null )
|
||||
sb.append(currentDomain).append("\n"); // DEBUG!
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private static final Lock fileWriteLock = new ReentrantLock(true);
|
||||
|
||||
public String writeToFile(String fileFullPath, String stringToWrite, boolean shouldLockThreads)
|
||||
{
|
||||
if ( shouldLockThreads )
|
||||
fileWriteLock.lock();
|
||||
|
||||
try ( BufferedWriter bufferedWriter = new BufferedWriter(Files.newBufferedWriter(Paths.get(fileFullPath)), FilesCompressor.bufferSize) )
|
||||
{
|
||||
bufferedWriter.write(stringToWrite); // This will overwrite the file. If the new string is smaller, then it does not matter.
|
||||
} catch (Exception e) {
|
||||
String errorMsg = "Failed to create or acquire the file \"" + fileFullPath + "\"!";
|
||||
logger.error(errorMsg, e);
|
||||
return errorMsg;
|
||||
} finally {
|
||||
if ( shouldLockThreads )
|
||||
fileWriteLock.unlock();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public static void countDatasourcesAndRecords(int assignmentsSize)
|
||||
{
|
||||
Set<String> datasources = assignmentsForPlugins.keySet();
|
||||
int numDatasources = datasources.size();
|
||||
logger.debug("Num of datasources: " + numDatasources);
|
||||
for ( String datasource : datasources ) {
|
||||
logger.debug("Num of records for datasource \"" + datasource + "\" is: " + assignmentsForPlugins.get(datasource).size() );
|
||||
}
|
||||
logger.debug("Average num of records per datasource: " + (assignmentsSize / numDatasources));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package eu.openaire.urls_worker.components;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.web.client.HttpServerErrorException;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
|
||||
|
||||
@Component
|
||||
public class ConnWithController {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ConnWithController.class);
|
||||
|
||||
private final String controllerBaseUrl;
|
||||
|
||||
|
||||
public ConnWithController(@Value("${info.controllerBaseUrl}") String controllerBaseUrl) {
|
||||
this.controllerBaseUrl = controllerBaseUrl;
|
||||
}
|
||||
|
||||
|
||||
public boolean postShutdownReportToController(String workerId)
|
||||
{
|
||||
logger.info("Going to \"postShutdownReportToController\".");
|
||||
try {
|
||||
ResponseEntity<String> responseEntity = new RestTemplate().postForEntity(this.controllerBaseUrl + "workerShutdownReport?workerId=" + workerId, null, String.class);
|
||||
int responseCode = responseEntity.getStatusCodeValue();
|
||||
if ( responseCode != HttpStatus.OK.value() ) {
|
||||
logger.error("HTTP-Connection problem with the submission of the \"postShutdownReportToController\"! Error-code was: " + responseCode);
|
||||
return false;
|
||||
}
|
||||
} catch (HttpServerErrorException hsee) {
|
||||
logger.error("The Controller failed to handle the \"postShutdownReportToController\": " + hsee.getMessage());
|
||||
return false;
|
||||
} catch (Exception e) {
|
||||
logger.error("Error for \"postShutdownReportToController\" to the Controller.", e);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
|
@ -2,20 +2,19 @@ package eu.openaire.urls_worker.components;
|
|||
|
||||
import eu.openaire.urls_worker.UrlsWorkerApplication;
|
||||
import eu.openaire.urls_worker.controllers.FullTextsController;
|
||||
import eu.openaire.urls_worker.plugins.PublicationsRetrieverPlugin;
|
||||
import eu.openaire.urls_worker.util.AssignmentsHandler;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import eu.openaire.urls_worker.controllers.GeneralController;
|
||||
import eu.openaire.urls_worker.services.FileStorageService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
@Component
|
||||
|
@ -23,61 +22,193 @@ public class ScheduledTasks {
|
|||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ScheduledTasks.class);
|
||||
|
||||
private static final SimpleDateFormat dateFormat = new SimpleDateFormat("HH:mm:ss");
|
||||
|
||||
//@Scheduled(fixedRate = 600_000) // Every 10 mins: 600_000
|
||||
public void reportCurrentTime() {
|
||||
logger.info("Server is live! Time is now {}", dateFormat.format(new Date()));
|
||||
}
|
||||
@Autowired
|
||||
AssignmentsHandler assignmentsHandler;
|
||||
|
||||
@Scheduled(fixedRate = 900_000) // Every 15 mins: 900_000
|
||||
public void handleNewAssignments() {
|
||||
if ( AssignmentsHandler.isAvailableForWork )
|
||||
AssignmentsHandler.handleAssignments();
|
||||
else {
|
||||
//logger.debug("The worker is not available for work at the moment.."); // JUST FOR DEBUG!
|
||||
}
|
||||
}
|
||||
@Autowired
|
||||
private FileStorageService fileStorageService;
|
||||
|
||||
@Autowired
|
||||
private ConnWithController connWithController;
|
||||
|
||||
@Value("${info.workerId}")
|
||||
private String workerId;
|
||||
|
||||
@Value("${workerReportsDirPath}")
|
||||
private String workerReportsDirPath;
|
||||
|
||||
private static final File rootPath = new File("/");
|
||||
private static final long oneAndHalfGB = 1_610_612_736L; // We need 1.5 GB free space per 1.000-assignments-batch.
|
||||
private static long requiredFreeSpace;
|
||||
|
||||
private static final int oneMb = (1024 * 1024);
|
||||
|
||||
|
||||
@Scheduled(fixedRate = 43_200_000) // Every 12 hours.
|
||||
public static void deleteHandledAssignmentsFullTexts()
|
||||
public ScheduledTasks(@Value("${info.maxAssignmentsLimitPerBatch}") int maxAssignmentsLimitPerBatch)
|
||||
{
|
||||
Set<Map.Entry<Long, Boolean>> entrySet = FullTextsController.assignmentsNumsHandledAndLocallyDeleted.entrySet();
|
||||
if ( entrySet.isEmpty() )
|
||||
if ( maxAssignmentsLimitPerBatch <= 1_000 )
|
||||
requiredFreeSpace = oneAndHalfGB;
|
||||
else
|
||||
requiredFreeSpace = oneAndHalfGB * (maxAssignmentsLimitPerBatch / 1_000);
|
||||
|
||||
logger.info("The \"requiredFreeSpace\" for the app to request new assignments, having \"maxAssignmentsLimitPerBatch\" equal to " + maxAssignmentsLimitPerBatch + " , is: " + (requiredFreeSpace / (1024 * 1024)) + " Mb");
|
||||
}
|
||||
|
||||
|
||||
@Scheduled(fixedDelay = 1) // Request the next batch immediately after the last one finishes.
|
||||
public void handleNewAssignments()
|
||||
{
|
||||
if ( GeneralController.shouldShutdownWorker || AssignmentsHandler.shouldNotRequestMore ) {
|
||||
// Here we will be right after the Worker has posted its last report. It is guaranteed that the Controller will not have processed it and have not requested the full-text files.
|
||||
// We do not want to shut down the Worker, until all files have been transferred to the Controller, or some time has passed.
|
||||
return;
|
||||
}
|
||||
|
||||
if ( rootPath.getFreeSpace() < requiredFreeSpace ) {
|
||||
// It's not safe to proceed with downloading more files and risk of "noSpaceLeft" error.
|
||||
// Wait for the Controller to take the full-texts and any remaining files to be deleted, so that more free-space becomes available.
|
||||
// We need to have some buffer zone for the ".tar" files which will be created from the already downloaded full-texts, when the Controller starts requesting them.
|
||||
logger.warn("The free space is running out (less than " + (requiredFreeSpace / oneMb) + " Mb). The Worker will avoid getting new assignments for the next 15 minutes.");
|
||||
try {
|
||||
Thread.sleep(900_000); // Sleep for 15 mins to stall the scheduler from retrying right away, thus giving time to the disk-space to be freed.
|
||||
} catch (InterruptedException ie) {
|
||||
logger.warn("Sleeping was interrupted!");
|
||||
}
|
||||
return; // Cause this method to be called again, so that the Free-space can be checked again before proceeding with new assignments.
|
||||
}
|
||||
|
||||
if ( AssignmentsHandler.hadConnectionErrorOnRequest ) {
|
||||
try {
|
||||
Thread.sleep(900_000); // Sleep for 15 mins to stall the scheduler from retrying right away, thus giving time to the Controller to recover.
|
||||
} catch (InterruptedException ie) {
|
||||
logger.warn("Sleeping was interrupted!");
|
||||
} finally {
|
||||
AssignmentsHandler.hadConnectionErrorOnRequest = false;
|
||||
}
|
||||
}
|
||||
|
||||
assignmentsHandler.handleAssignments();
|
||||
}
|
||||
|
||||
|
||||
@Scheduled(initialDelay = 900_000, fixedDelay = 1_800_000) // InitialDelay = 15 mins, FixedDelay = 30 mins.
|
||||
//@Scheduled(initialDelay = 60_000, fixedDelay = 60_000) // Just for testing (every 60 secs).
|
||||
public void checkIfShouldShutdown()
|
||||
{
|
||||
if ( !GeneralController.shouldShutdownWorker && !AssignmentsHandler.shouldNotRequestMore )
|
||||
return;
|
||||
|
||||
logger.info("Going to delete the locally stored fullTexts.");
|
||||
// Check if the full-texts have been delivered to the Controller.
|
||||
// In case some files have been left behind due to an error. DO not shut down, but wait for the other scheduled task to clean the in the right time and then shutdown.
|
||||
|
||||
for ( Map.Entry<Long,Boolean> entry : entrySet )
|
||||
{
|
||||
if ( entry.getValue().equals(true) ) // It is already deleted, move on.
|
||||
continue;
|
||||
|
||||
Long curAssignments = entry.getKey();
|
||||
String currentAssignmentsBasePath = PublicationsRetrieverPlugin.assignmentsBasePath + "assignments_" + curAssignments + "_fullTexts" + File.separator;
|
||||
logger.debug("Going to delete the files from assignments: " + currentAssignmentsBasePath);
|
||||
|
||||
File curDir = new File(currentAssignmentsBasePath);
|
||||
if ( !curDir.isDirectory() ) {
|
||||
logger.error("This assignments-dir does not exist: " + currentAssignmentsBasePath);
|
||||
continue;
|
||||
File fullTextsBaseDir = new File(fileStorageService.assignmentsBaseLocation);
|
||||
if ( fullTextsBaseDir.isDirectory() ) {
|
||||
File[] fulltextSubDirs = fullTextsBaseDir.listFiles(File::isDirectory);
|
||||
if ( fulltextSubDirs == null ) {
|
||||
logger.error("There was an error when getting the subDirs of \"fullTextsBaseDir\": " + fullTextsBaseDir);
|
||||
return; // It's NOT safe to shut down.
|
||||
}
|
||||
if ( fulltextSubDirs.length > 0 ) {
|
||||
logger.warn("The base full-texts directory still has " + fulltextSubDirs.length + " sub-directories with full-texts, wait for the Controller to take all the files, or wait some time to past before they are deleted. Then the Worker will shut down.");
|
||||
// Some subDirs may be left behind due to some error when processing the WorkerReport. In that case,
|
||||
return;
|
||||
} else
|
||||
logger.debug("The \"fullTextsBaseDir\" is empty. Shutting down..");
|
||||
} else
|
||||
logger.warn("The base full-texts directory was not found! Shutting down.."); // This base-directory should exist during run-time, but we can proceed with shutting down the Service.
|
||||
|
||||
try {
|
||||
FileUtils.deleteDirectory(curDir);
|
||||
FullTextsController.assignmentsNumsHandledAndLocallyDeleted.put(curAssignments, true); // Set the is-handled to true.
|
||||
} catch (IOException e) {
|
||||
logger.error("The following directory could not be deleted: " + currentAssignmentsBasePath, e);
|
||||
}
|
||||
}
|
||||
connWithController.postShutdownReportToController(workerId);
|
||||
UrlsWorkerApplication.gentleAppShutdown();
|
||||
}
|
||||
|
||||
|
||||
//@Scheduled(fixedRate = 20_000) // Every 20 secs.
|
||||
public void testUrlConnection() {
|
||||
String urlToCheck = "https://zenodo.org/record/1145726";
|
||||
PublicationsRetrieverPlugin.connectWithUrlTest(urlToCheck);
|
||||
private static final Pattern ASSIGNMENTS_COUNTER = Pattern.compile(".*assignments_([\\d]+).*");
|
||||
|
||||
|
||||
private static final double hoursToWaitBeforeDeletion = 48.0;
|
||||
|
||||
private static final int hoursDivisor = (1000 * 60 * 60); // In order to get the time-difference in hours. We divide with: /1000 to get seconds, /60 to get minutes and /60 to get hours.
|
||||
|
||||
@Scheduled(initialDelay = 21_600_000, fixedDelay = 21_600_000) // InitialDelay & FixedDelay = 36 hours.
|
||||
//@Scheduled(initialDelay = 120_000, fixedDelay = 120_000) // Just for testing (every 2 mins).
|
||||
public void checkAndDeleteOldFiles() {
|
||||
// For any reason the Worker-report connection with the Controller may fail, but the Controller will continue requesting the full-text batches.
|
||||
|
||||
// Every X hours, check the last modification data of each "assignments_X_fulltexts" sub-directory.
|
||||
|
||||
// All sub-directories will have some files inside, as the duplicate files will not have been requested by the Controller, thus not been deleted after a batch.
|
||||
// Also, the last .zstd file will be inside.
|
||||
|
||||
// The way to know for which directory, we have a problem, is either by the amount of files or by the WorkerReport (in a separate directory).
|
||||
|
||||
// Even though we delete the full-texts batch-by-batch, some files may not have been previously deleted, since they may be duplicates of others found by previous assignments-batches
|
||||
// and thus, they may have not been requested by the Controller (and thus not deleted after transferring the batches).
|
||||
// Also, the ".tar.zstd" file of last batch will be deleted here, as well as the whole directory itself.
|
||||
|
||||
logger.debug("Going to check if any leftover full-texts exist and delete them.");
|
||||
int usableDirsNum = 0;
|
||||
try {
|
||||
File fullTextsBaseDir = new File(fileStorageService.assignmentsBaseLocation);
|
||||
if ( !fullTextsBaseDir.isDirectory() ) {
|
||||
logger.error("The \"fullTextsBaseDir\" (" + fileStorageService.assignmentsBaseLocation + ") does not exist!"); // This base dir should always exist during execution!
|
||||
return;
|
||||
}
|
||||
File[] fulltextSubDirs = fullTextsBaseDir.listFiles(File::isDirectory);
|
||||
if ( fulltextSubDirs == null ) {
|
||||
logger.error("There was an error when getting the subDirs of \"fullTextsBaseDir\": " + fullTextsBaseDir);
|
||||
return;
|
||||
}
|
||||
usableDirsNum = fulltextSubDirs.length;
|
||||
if ( usableDirsNum == 0 ) {
|
||||
logger.debug("The \"fullTextsBaseDir\" is empty, so there is nothing to delete.");
|
||||
return;
|
||||
}
|
||||
|
||||
long currentTime = System.currentTimeMillis();
|
||||
|
||||
// Loop through the array, check the "lastModified" time and if it is too old delete the full-texts subDir and the related workerReport.
|
||||
for ( File subDir : fulltextSubDirs ) {
|
||||
long lastModified = subDir.lastModified();
|
||||
|
||||
if ( logger.isTraceEnabled() )
|
||||
logger.trace("The subDir \"" + subDir.getName() + "\" was last accessed in: " + new Date(lastModified));
|
||||
|
||||
// Get the difference in hours. /1000 to get seconds, /60 to get minutes and /60 to get hours.
|
||||
double elapsedHours = (double) (currentTime - lastModified) / hoursDivisor;
|
||||
if ( elapsedHours > hoursToWaitBeforeDeletion ) {
|
||||
// Enough time has passed, the directory should be deleted immediately.
|
||||
String subDirName = subDir.getName();
|
||||
logger.warn("The subDir \"" + subDirName + "\" was accessed " + elapsedHours + " hours ago (passed the " + hoursToWaitBeforeDeletion + " hours limit) and will be deleted, along with the related WorkerReport.");
|
||||
FullTextsController.deleteDirectory(subDir);
|
||||
|
||||
// Extract the "assignmentsCounter" from subDir's name, in order to delete the right report file.
|
||||
Matcher matcher = ASSIGNMENTS_COUNTER.matcher(subDirName);
|
||||
if ( matcher.matches() ) {
|
||||
String assingmentsCounterString = matcher.group(1);
|
||||
if ( (assingmentsCounterString != null) && !assingmentsCounterString.isEmpty()) {
|
||||
if ( FullTextsController.deleteFile(this.workerReportsDirPath + this.workerId + "_assignments_" + assingmentsCounterString + "_report.json") )
|
||||
logger.warn("The subDir \"" + subDirName + "\" probably contains some failed files, since the workerReport for assignments_" + assingmentsCounterString + " was deleted only now, which means the Controller failed to successfully process the results of those assignments.");
|
||||
} else
|
||||
logger.error("The subDir \"" + subDirName + "\" has an invalid name! It does not contains the assignmentsCounter!");
|
||||
} else
|
||||
logger.error("The subDir \"" + subDirName + "\" has an invalid name! It could not be matched with regex: " + ASSIGNMENTS_COUNTER);
|
||||
usableDirsNum --; // Reduce the usableDirsNum even if some directories failed to be deleted, since the failed-dirs are not usable anyway.
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("", e);
|
||||
return;
|
||||
}
|
||||
|
||||
// After the cleanup of the remaining files, make sure we shutdown the Worker if it is desired.
|
||||
// Do this here, instead of waiting further, for the "checkIfShouldShutdown()" method to be called and shut it down.
|
||||
|
||||
if ( (GeneralController.shouldShutdownWorker || AssignmentsHandler.shouldNotRequestMore)
|
||||
&& (usableDirsNum == 0) ) { // Shutdown only if there are no "usable" directories left.
|
||||
connWithController.postShutdownReportToController(workerId);
|
||||
UrlsWorkerApplication.gentleAppShutdown();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,299 @@
|
|||
package eu.openaire.urls_worker.components.plugins;
|
||||
|
||||
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||||
import eu.openaire.publications_retriever.util.file.FileUtils;
|
||||
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
|
||||
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
|
||||
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
|
||||
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
|
||||
import eu.openaire.publications_retriever.util.url.UrlUtils;
|
||||
import eu.openaire.urls_worker.components.AssignmentsHandler;
|
||||
import eu.openaire.urls_worker.controllers.GeneralController;
|
||||
import eu.openaire.urls_worker.models.Assignment;
|
||||
import eu.openaire.urls_worker.models.Error;
|
||||
import eu.openaire.urls_worker.models.Payload;
|
||||
import eu.openaire.urls_worker.models.UrlReport;
|
||||
import eu.openaire.urls_worker.services.FileStorageService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.CookieStore;
|
||||
import java.sql.Timestamp;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
|
||||
@Component
|
||||
public class PublicationsRetrieverPlugin {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class);
|
||||
|
||||
public static String assignmentsBasePath;
|
||||
|
||||
private static CookieStore cookieStore = null;
|
||||
|
||||
|
||||
public PublicationsRetrieverPlugin(@Value("${info.maxAssignmentsLimitPerBatch}") int maxAssignmentsLimitPerBatch, FileStorageService fileStorageService) {
|
||||
// Specify some configurations
|
||||
LoaderAndChecker.retrieveDocuments = true;
|
||||
LoaderAndChecker.retrieveDatasets = false;
|
||||
ConnSupportUtils.setKnownMimeTypes();
|
||||
FileUtils.shouldDownloadDocFiles = true;
|
||||
FileUtils.docFileNameType = FileUtils.DocFileNameType.idName;
|
||||
PublicationsRetriever.targetUrlType = "docUrl";
|
||||
FileUtils.jsonBatchSize = maxAssignmentsLimitPerBatch;
|
||||
|
||||
assignmentsBasePath = fileStorageService.assignmentsBaseLocation;
|
||||
|
||||
ConnSupportUtils.shouldBlockMost5XXDomains = false; // If this is "true", all but the "503" will be blocked. Otherwise, only the "511" will be blocked.
|
||||
LoaderAndChecker.setCouldRetryRegex();
|
||||
|
||||
cookieStore = HttpConnUtils.cookieManager.getCookieStore();
|
||||
|
||||
int availableProcessors = Runtime.getRuntime().availableProcessors();
|
||||
if ( availableProcessors <= 4 )
|
||||
PublicationsRetriever.threadsMultiplier = 10;
|
||||
else
|
||||
PublicationsRetriever.threadsMultiplier = 6;
|
||||
|
||||
int workerThreadsCount = (availableProcessors * PublicationsRetriever.threadsMultiplier);
|
||||
logger.info("Use " + workerThreadsCount + " worker-threads.");
|
||||
PublicationsRetriever.executor = Executors.newFixedThreadPool(workerThreadsCount);
|
||||
}
|
||||
|
||||
|
||||
private static final List<Callable<Boolean>> callableTasks = new ArrayList<>(FileUtils.jsonBatchSize);
|
||||
|
||||
public void processAssignments(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException
|
||||
{
|
||||
// At this point, the "assignmentsBasePath"-directory has already been successfully created.
|
||||
|
||||
String currentAssignmentsSubDir = "assignments_" + assignmentRequestCounter + "_fullTexts" + File.separator;
|
||||
FileUtils.storeDocFilesDir = assignmentsBasePath + currentAssignmentsSubDir; // It needs the last separator, because of how the docFiles are named and stored.
|
||||
|
||||
File curAssignmentsDirs = new File(FileUtils.storeDocFilesDir);
|
||||
try {
|
||||
if ( !curAssignmentsDirs.exists() ) {
|
||||
if ( !curAssignmentsDirs.mkdirs() ) // Try to create the directory(-ies) if they don't exist. If they exist OR if sth went wrong, the result is the same: "false".
|
||||
throw new RuntimeException("Could not create the \"" + currentAssignmentsSubDir + "\" directories: \"" + FileUtils.storeDocFilesDir + "\"!");
|
||||
} else
|
||||
logger.warn("The curAssignmentsDirs: \"" + currentAssignmentsSubDir + "\" already exist! Probably left behind by a previous execution..");
|
||||
} catch (Exception e) { // Mainly a SecurityException.
|
||||
throw new RuntimeException("Failed to create the full-texts directory for assignments_" + assignmentRequestCounter + ": " + e.getMessage());
|
||||
}
|
||||
|
||||
final int[] urlsCounter = {0};
|
||||
int numOfAssignments = assignments.size();
|
||||
|
||||
// Start loading and checking urls.
|
||||
for ( Assignment assignment : assignments )
|
||||
{
|
||||
callableTasks.add(() -> {
|
||||
urlsCounter[0]++;
|
||||
if ( (urlsCounter[0] % 250) == 0 ) // Every 250 urls, display a "progress" message.
|
||||
logger.debug("Assignments_" + assignmentRequestCounter + " progress: will process url-" + urlsCounter[0] + " out of " + numOfAssignments);
|
||||
|
||||
String id = assignment.getId();
|
||||
String url = assignment.getOriginalUrl();
|
||||
|
||||
if ( (id == null) || id.isEmpty() || (url == null) || url.isEmpty() ) {
|
||||
String errorMsg = "Got null or empty pair! ID=" + id + " , url=" + url;
|
||||
logger.warn(errorMsg);
|
||||
UrlUtils.logOutputData(id, url, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to input problems. " + errorMsg, null, true, "true", "false", "false", "false", "false", null, null);
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( (url = LoaderAndChecker.handleUrlChecks(id, url)) == null )
|
||||
return false;
|
||||
|
||||
String urlToCheck = url; // The "url" might have changed (inside "handleUrlChecks()").
|
||||
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
||||
if ( (urlToCheck = LoaderAndChecker.basicURLNormalizer.filter(sourceUrl)) == null ) {
|
||||
logger.warn("Could not normalize url: " + sourceUrl);
|
||||
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
|
||||
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(url) ) { // If we got into an already-found docUrl, log it and return.
|
||||
ConnSupportUtils.handleReCrossedDocUrl(id, url, url, url, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean isPossibleDocOrDatasetUrl = false; // Used for specific connection settings.
|
||||
String lowerCaseRetrievedUrl = url.toLowerCase();
|
||||
// Check if it's a possible-DocUrl, if so, this info will be used for optimal web-connection later.
|
||||
if ( (LoaderAndChecker.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches())
|
||||
|| (LoaderAndChecker.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches()) ) {
|
||||
//logger.debug("Possible docUrl or datasetUrl: " + url);
|
||||
isPossibleDocOrDatasetUrl = true;
|
||||
}
|
||||
|
||||
try { // Check if it's a docUrl, if not, it gets crawled.
|
||||
HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
|
||||
} catch (Exception e) {
|
||||
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e, urlToCheck);
|
||||
String wasUrlValid = list.get(0);
|
||||
String couldRetry = list.get(1);
|
||||
String errorMsg = "Discarded at loading time, as " + list.get(2);
|
||||
UrlUtils.logOutputData(id, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
int numFailedTasks = LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
|
||||
if ( numFailedTasks == -1 ) { // The unknown exception is logged inside the above method.
|
||||
GeneralController.shouldShutdownWorker = true;
|
||||
AssignmentsHandler.shouldNotRequestMore = true;
|
||||
PublicationsRetriever.executor.shutdownNow(); // Close the thread-pool immediately. It will not be used again while the Worker is still running.
|
||||
throw new RuntimeException("Invoking and/or executing the callableTasks failed with the exception (which is written in the log files)!");
|
||||
}
|
||||
|
||||
if ( numFailedTasks > 0 )
|
||||
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
|
||||
|
||||
addUrlReportsToWorkerReport(assignments);
|
||||
|
||||
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
||||
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
|
||||
// In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
|
||||
|
||||
ConnSupportUtils.domainsWithConnectionData.clear(); // This data is not useful for the next batch, since plenty of time will have passed before needing to check the "lastConnectedTime" for each domain, in order to apply the "politenessDelay".
|
||||
|
||||
//logger.debug("The number of cookies is: " + cookieStore.getCookies().size()); // debug!
|
||||
boolean cookiesDeleted = cookieStore.removeAll();
|
||||
//logger.debug(cookiesDeleted ? "The cookies where removed!" : "No cookies where removed!"); // DEBUG!
|
||||
}
|
||||
|
||||
|
||||
private static final int lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage = ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length();
|
||||
private static final int lengthOfAlreadyDownloadedFromIDMessage = ConnSupportUtils.alreadyDownloadedFromIDMessage.length();
|
||||
|
||||
private static final String provenance = "crawl:PublicationsRetriever";
|
||||
|
||||
|
||||
public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
|
||||
{
|
||||
if ( FileUtils.dataToBeLoggedList.size() != assignments.size() ) {
|
||||
logger.warn("The number of the results (" + FileUtils.dataToBeLoggedList.size() + ") is different from the number of the given assignments (" + assignments.size() + ")!");
|
||||
} // TODO - Should any other step be taken, except from just showing the log-message?
|
||||
|
||||
// Index the UrlIds with the DatasourceIds for quick-search later. The datasourceIds are not included in the "DataToBeLogged" objects.
|
||||
HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
|
||||
for ( Assignment assignment : assignments )
|
||||
urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
|
||||
|
||||
int numOfUnretrievedFiles = 0;
|
||||
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
|
||||
|
||||
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
||||
{
|
||||
// TODO - Consider adding multi-thread execution for the following code.
|
||||
// In that case, use "ConcurrentHashMap".
|
||||
|
||||
UrlReport.StatusType status = null;
|
||||
String fileLocation = null, comment = data.getComment(), mimeType = null, hash = data.getHash();
|
||||
Long size = data.getSize();
|
||||
Error error = null;
|
||||
|
||||
if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) ) // The reversed order defends against a potential NPE.
|
||||
{
|
||||
status = UrlReport.StatusType.accessible;
|
||||
if ( comment.startsWith(ConnSupportUtils.alreadyDownloadedFromIDMessage, 0) ) { // If this is not the initially-found docUrl record, go search for the initial.
|
||||
// The file of this docUrl was already downloaded by another docUrl.
|
||||
int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage);
|
||||
int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage;
|
||||
String initialId = comment.substring(lengthOfAlreadyDownloadedFromIDMessage, indexOfAlreadyDownloadedFromSourceUrlMessage); // The fileName starts right after the "message".
|
||||
String initialSourceUrl = comment.substring(indexOfAlreadyDownloadedFromSourceUrl);
|
||||
//logger.debug("initialId: " + initialId + " | sourceUrl: " + initialSourceUrl); // DEBUG!
|
||||
// Search that ID and sourceUrl inside the list, if that instance is the first-found one, then get the file-data (there might be duplicate ID-sourceUrl instances, but only one of them has the file-data).
|
||||
boolean foundAlreadyDownloadedFullText = false;
|
||||
boolean foundIDUrlInWorkerReport = false;
|
||||
for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList )
|
||||
{
|
||||
if ( ! (data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl))) )
|
||||
continue;
|
||||
|
||||
// At this point we have found a record which has the same id and sourceUrl as the inspected record.
|
||||
foundIDUrlInWorkerReport = true;
|
||||
|
||||
if ( "false".equals(data_2.getWasDocumentOrDatasetAccessible()) )
|
||||
continue;
|
||||
|
||||
// At this point we have excluded any non-docUrl record, even if it has the same id and sourceUrl.
|
||||
// It is possible, that the same sourceUrl at one time it gives the docUrl and at another it does not, due to some kind of error.
|
||||
// So, we do not want to accept a record-instance which does not lead to any file, even if another instance of the same record did lead to a file.
|
||||
|
||||
String tempFileLocation = data_2.getComment();
|
||||
if ( tempFileLocation.startsWith(ConnSupportUtils.alreadyDownloadedFromIDMessage, 0) || tempFileLocation.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) )
|
||||
continue;
|
||||
|
||||
// At this point we have found that another instance of the same record gives the docFile itself, not a reference to it.
|
||||
fileLocation = tempFileLocation;
|
||||
size = data_2.getSize();
|
||||
hash = data_2.getHash();
|
||||
mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned to the value provided by the plugin (it has to be added in the future).
|
||||
foundAlreadyDownloadedFullText = true;
|
||||
break;
|
||||
}
|
||||
// In case the "alreadyDownloaded" full-text is not found, we have an error. All file-related data is "null".
|
||||
if ( !foundAlreadyDownloadedFullText ) {
|
||||
String addErrorMessage = ((!foundIDUrlInWorkerReport) ? " | That ID-sourceUrl was not found inside the WorkerReport!" : " | The file was not downloaded!");
|
||||
error = new Error(Error.ErrorType.couldRetry, comment + addErrorMessage); // We can still try to download it from the found docUrl, in the future.
|
||||
// The "fileLocation" is null.
|
||||
}
|
||||
}
|
||||
else if ( ! comment.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) ) { // If it was downloaded without an error.
|
||||
fileLocation = comment; // This is the full-file-path.
|
||||
mimeType = "application/pdf";
|
||||
} else { // Else the file was not retrieved, so all file-related data are kept "null".
|
||||
numOfUnretrievedFiles ++;
|
||||
error = new Error(Error.ErrorType.couldRetry, comment); // We can still try to download it from the found docUrl, in the future.
|
||||
}
|
||||
|
||||
if ( error == null ) // If the file was retrieved, in any time.
|
||||
error = new Error(Error.ErrorType.couldRetry, null); // We do not want to send a "null" Error-object, since it just adds more complicated handling in the Controller..
|
||||
}
|
||||
else {
|
||||
status = UrlReport.StatusType.non_accessible;
|
||||
if ( "true".equals(data.getCouldRetry()) )
|
||||
error = new Error(Error.ErrorType.couldRetry, comment);
|
||||
else
|
||||
error = new Error(Error.ErrorType.noRetry, comment);
|
||||
}
|
||||
|
||||
String docOrDatasetUrl = data.getDocOrDatasetUrl();
|
||||
if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
|
||||
docOrDatasetUrl = null;
|
||||
|
||||
// Convert "null" strings to actual < null >
|
||||
if ( (hash != null) && (hash.equals("null")) )
|
||||
hash = null;
|
||||
|
||||
String urlId = data.getUrlId();
|
||||
String datasourceId = urlIdsWithDatasourceIds.get(urlId);
|
||||
|
||||
// Each record will have the urlID, the datasourceID and possibly one filename, which may contain a different urlID.
|
||||
// The Controller will select the correct datasourceID for before adding it inside the S3-ObjectStore filename.
|
||||
|
||||
Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, provenance, datasourceId);
|
||||
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||
|
||||
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
|
||||
}// end-for
|
||||
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
|
||||
|
||||
if ( numOfUnretrievedFiles > 50 )
|
||||
logger.warn("The number of non-retrieved files is: " + numOfUnretrievedFiles);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,9 +1,13 @@
|
|||
package eu.openaire.urls_worker.controllers;
|
||||
|
||||
import eu.openaire.urls_worker.components.plugins.PublicationsRetrieverPlugin;
|
||||
import eu.openaire.urls_worker.services.FileStorageService;
|
||||
import eu.openaire.urls_worker.util.FilesZipper;
|
||||
import eu.openaire.urls_worker.util.FilesCompressor;
|
||||
import org.apache.commons.io.FileDeleteStrategy;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.InputStreamResource;
|
||||
import org.springframework.http.HttpHeaders;
|
||||
import org.springframework.http.MediaType;
|
||||
|
@ -13,9 +17,11 @@ import org.springframework.web.bind.annotation.PathVariable;
|
|||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.util.HashMap;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
|
||||
@RestController
|
||||
|
@ -24,30 +30,35 @@ public class FullTextsController {
|
|||
|
||||
private static final Logger logger = LoggerFactory.getLogger(GeneralController.class);
|
||||
|
||||
public static HashMap<Long, Boolean> assignmentsNumsHandledAndLocallyDeleted = new HashMap<>();
|
||||
|
||||
public static String assignmentsBaseDir = null;
|
||||
@Autowired
|
||||
private FileStorageService fileStorageService;
|
||||
|
||||
|
||||
public FullTextsController() {
|
||||
assignmentsBaseDir = FileStorageService.assignmentsLocation.toString() + File.separator;
|
||||
}
|
||||
|
||||
|
||||
@GetMapping("getFullTexts/{assignmentsCounter:[\\d]+}/{totalZipBatches:[\\d]+}/{zipBatchCounter:[\\d]+}/{fileNamesWithExtensions}")
|
||||
public Object getMultipleFullTexts(@PathVariable long assignmentsCounter, @PathVariable int totalZipBatches, @PathVariable int zipBatchCounter, @PathVariable List<String> fileNamesWithExtensions) {
|
||||
|
||||
@GetMapping("getFullTexts/{assignmentsCounter:[\\d]+}/{totalBatches:[\\d]+}/{batchCounter:[\\d]+}/{fileNamesWithExtensions}")
|
||||
public Object getFullTexts(@PathVariable long assignmentsCounter, @PathVariable int totalBatches, @PathVariable int batchCounter, @PathVariable List<String> fileNamesWithExtensions)
|
||||
{
|
||||
int fileNamesListNum = fileNamesWithExtensions.size();
|
||||
if ( (fileNamesListNum == 1) && (fileNamesWithExtensions.get(0).length() == 0) ) { // In case the last "/" in the url was given, then this list will not be empty, but have one empty item instead.
|
||||
if ( (fileNamesListNum == 1) && (fileNamesWithExtensions.get(0).isEmpty()) ) { // In case the last "/" in the url was given (without any files following), then this list will not be empty, but have one empty item instead.
|
||||
// In case the url does not end in "/", then Spring will automatically return an "HTTP-BadRequest".
|
||||
String errorMsg = "An empty \"fileNamesWithExtensions\" list was given from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter;
|
||||
logger.warn(errorMsg);
|
||||
String errorMsg = "An empty \"fileNamesWithExtensions\" list was given from assignments_" + assignmentsCounter + ", for batch_" + batchCounter;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
|
||||
logger.info("Received a \"getMultipleFullTexts\" request for returning a zip-file containing " + fileNamesListNum + " full-texts, from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter + " (out of " + totalZipBatches + ").");
|
||||
if ( totalBatches == 0 ) {
|
||||
String errorMsg = "The given \"totalBatches\" (" + totalBatches + ") was < 0 >!";
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
else if ( batchCounter > totalBatches ) {
|
||||
String errorMsg = "The given \"batchCounter\" (" + batchCounter + ") is greater than the \"totalBatches\" (" + totalBatches + ")!";
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
|
||||
String currentAssignmentsBaseFullTextsPath = assignmentsBaseDir + "assignments_" + assignmentsCounter + "_fullTexts" + File.separator;
|
||||
logger.info("Received a \"getFullTexts\" request for returning a \".tar.zstd\" file, containing " + fileNamesListNum + " full-texts, from assignments_" + assignmentsCounter + ", for batch_" + batchCounter + " (out of " + totalBatches + ").");
|
||||
|
||||
String currentAssignmentsBaseFullTextsPath = fileStorageService.assignmentsBaseLocation + "assignments_" + assignmentsCounter + "_fullTexts" + File.separator;
|
||||
|
||||
if ( ! (new File(currentAssignmentsBaseFullTextsPath).isDirectory()) ) {
|
||||
String errorMsg = "The base directory for assignments_" + assignmentsCounter + " was not found: " + currentAssignmentsBaseFullTextsPath;
|
||||
|
@ -55,36 +66,34 @@ public class FullTextsController {
|
|||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
|
||||
if ( zipBatchCounter > totalZipBatches ) {
|
||||
String errorMsg = "The given \"zipBatchCounter\" (" + zipBatchCounter + ") is greater than the \"totalZipBatches\" (" + totalZipBatches + ")!";
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
|
||||
File zipFile = FilesZipper.zipMultipleFilesAndGetZip(assignmentsCounter, zipBatchCounter, fileNamesWithExtensions, currentAssignmentsBaseFullTextsPath);
|
||||
if ( zipFile == null ) {
|
||||
String errorMsg = "Failed to create the zip file for \"zipBatchCounter\"-" + zipBatchCounter;
|
||||
File zstdFile = FilesCompressor.compressMultipleFilesIntoOne(assignmentsCounter, batchCounter, fileNamesWithExtensions, currentAssignmentsBaseFullTextsPath);
|
||||
if ( zstdFile == null ) {
|
||||
// The failed files (including the ".tar"), have already been deleted.
|
||||
String errorMsg = "Failed to create the zstd file for \"batchCounter\"-" + batchCounter;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.internalServerError().body(errorMsg);
|
||||
}
|
||||
|
||||
// If this is the last batch for this assignments-count, then make sure it is deleted in the next scheduled delete-operation.
|
||||
if ( zipBatchCounter == totalZipBatches ) {
|
||||
assignmentsNumsHandledAndLocallyDeleted.put(assignmentsCounter, false);
|
||||
logger.debug("Will return the last batch (" + zipBatchCounter + ") of Assignments_" + assignmentsCounter + " to the Controller and these assignments will be deleted later.");
|
||||
}
|
||||
if ( batchCounter == totalBatches )
|
||||
logger.debug("Will return the " + ((totalBatches > 1) ? "last" : "only one") + " batch (" + batchCounter + ") of assignments_" + assignmentsCounter + " to the Controller.");
|
||||
|
||||
String zipName = zipFile.getName();
|
||||
String zipFileFullPath = currentAssignmentsBaseFullTextsPath + zipName;
|
||||
String zstdName = zstdFile.getName();
|
||||
String zstdTarFileFullPath = currentAssignmentsBaseFullTextsPath + zstdName;
|
||||
try {
|
||||
return ResponseEntity.ok()
|
||||
.contentType(MediaType.APPLICATION_OCTET_STREAM)
|
||||
.header(HttpHeaders.CONTENT_DISPOSITION, "inline; filename=\"" + zipName + "\"")
|
||||
.body(new InputStreamResource(new FileInputStream(zipFileFullPath)));
|
||||
.header(HttpHeaders.CONTENT_DISPOSITION, "inline; filename=\"" + zstdName + "\"")
|
||||
.body(new InputStreamResource(new BufferedInputStream(Files.newInputStream(Paths.get(zstdTarFileFullPath)), FilesCompressor.bufferSize)));
|
||||
} catch (Exception e) {
|
||||
String errorMsg = "Could not load the FileInputStream of the zip-file \"" + zipFileFullPath + "\"!";
|
||||
String errorMsg = "Could not load the FileInputStream of the zstd-tar-file \"" + zstdTarFileFullPath + "\"!";
|
||||
logger.error(errorMsg, e);
|
||||
return ResponseEntity.internalServerError().body(errorMsg);
|
||||
} finally {
|
||||
// The ".tar.zstd" file of this batch, for which we pass a steam to the Controller, will be deleted by the next batch or in the end of these assignments.
|
||||
// Now we will delete the zstd file of the previous assignments.
|
||||
int previousBatchCounter = (batchCounter -1);
|
||||
if ( previousBatchCounter >= 1 )
|
||||
deleteFile(currentAssignmentsBaseFullTextsPath + "assignments_" + assignmentsCounter + "_full-texts_" + previousBatchCounter + ".tar.zstd");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -93,7 +102,7 @@ public class FullTextsController {
|
|||
public ResponseEntity<?> getFullText(@PathVariable long assignmentsCounter, @PathVariable String fileNameWithExtension) {
|
||||
|
||||
logger.info("Received a \"getFullText\" request.");
|
||||
String fullTextFileFullPath = assignmentsBaseDir + "assignments_" + assignmentsCounter + "_fullTexts" + File.separator + fileNameWithExtension;
|
||||
String fullTextFileFullPath = fileStorageService.assignmentsBaseLocation + "assignments_" + assignmentsCounter + "_fullTexts" + File.separator + fileNameWithExtension;
|
||||
File file = new File(fullTextFileFullPath);
|
||||
if ( !file.isFile() ) {
|
||||
logger.error("The file \"" + fullTextFileFullPath + "\" does not exist!");
|
||||
|
@ -104,7 +113,7 @@ public class FullTextsController {
|
|||
return ResponseEntity.ok()
|
||||
.contentType(MediaType.APPLICATION_OCTET_STREAM)
|
||||
.header(HttpHeaders.CONTENT_DISPOSITION, "inline; filename=\"" + file.getName() + "\"")
|
||||
.body(new InputStreamResource(new FileInputStream(fullTextFileFullPath)));
|
||||
.body(new InputStreamResource(new BufferedInputStream(Files.newInputStream(Paths.get(fullTextFileFullPath)), FilesCompressor.bufferSize)));
|
||||
} catch (Exception e) {
|
||||
String errorMsg = "Could not load the FileInputStream of the full-text-file \"" + fullTextFileFullPath + "\"!";
|
||||
logger.error(errorMsg, e);
|
||||
|
@ -112,4 +121,46 @@ public class FullTextsController {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean deleteAssignmentsDirectory(long curAssignments, File dir)
|
||||
{
|
||||
if ( dir == null ) {
|
||||
String directoryPath = PublicationsRetrieverPlugin.assignmentsBasePath;
|
||||
if ( curAssignments != -1 ) {
|
||||
directoryPath += "assignments_" + curAssignments + "_fullTexts";
|
||||
logger.debug("Going to delete the files inside the directory of assignments_" + curAssignments);
|
||||
} else
|
||||
logger.debug("Going to delete the parent assignments directory: " + directoryPath);
|
||||
dir = new File(directoryPath);
|
||||
}
|
||||
return deleteDirectory(dir);
|
||||
}
|
||||
|
||||
|
||||
public static boolean deleteDirectory(File directory)
|
||||
{
|
||||
try {
|
||||
FileUtils.deleteDirectory(directory);
|
||||
return true;
|
||||
} catch (IOException e) {
|
||||
logger.error("The following directory could not be deleted: " + directory.getPath(), e);
|
||||
return false;
|
||||
} catch (IllegalArgumentException iae) {
|
||||
logger.error("This directory does not exist: " + directory.getPath());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean deleteFile(String fileFullPathString)
|
||||
{
|
||||
try {
|
||||
FileDeleteStrategy.FORCE.delete(new File(fileFullPathString));
|
||||
} catch (IOException e) {
|
||||
logger.error("Error when deleting the file: " + fileFullPathString);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
package eu.openaire.urls_worker.controllers;
|
||||
|
||||
import eu.openaire.urls_worker.UrlsWorkerApplication;
|
||||
import eu.openaire.urls_worker.payloads.responces.WorkerResponse;
|
||||
import eu.openaire.urls_worker.util.AssignmentsHandler;
|
||||
import eu.openaire.urls_worker.components.AssignmentsHandler;
|
||||
import eu.openaire.urls_worker.components.plugins.PublicationsRetrieverPlugin;
|
||||
import eu.openaire.urls_worker.util.UriBuilder;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import java.io.File;
|
||||
import java.net.UnknownHostException;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
@RestController
|
||||
|
@ -22,43 +23,131 @@ public class GeneralController {
|
|||
|
||||
private static final Logger logger = LoggerFactory.getLogger(GeneralController.class);
|
||||
|
||||
@Autowired
|
||||
AssignmentsHandler assignmentsHandler;
|
||||
|
||||
private final String controllerIp;
|
||||
private final String workerReportsDirPath;
|
||||
private final String workerId;
|
||||
|
||||
|
||||
private static final Pattern DOMAIN_DETECTOR = Pattern.compile("^.*[a-zA-Z].*$");
|
||||
|
||||
public GeneralController(@Value("${info.controllerIp}") String controllerIp, @Value("${workerReportsDirPath}") String workerReportsDirPath, @Value("${info.workerId}") String workerId)
|
||||
{
|
||||
if ( DOMAIN_DETECTOR.matcher(controllerIp).matches() ) {
|
||||
try {
|
||||
this.controllerIp = java.net.InetAddress.getByName(controllerIp).getHostAddress();
|
||||
} catch (UnknownHostException uhe) {
|
||||
String errorMsg = "The domain given for the Controller (" + controllerIp + ") is unknown to the world! So its IP cannot be retrieved!";
|
||||
logger.error(errorMsg);
|
||||
throw new RuntimeException(errorMsg);
|
||||
}
|
||||
} else
|
||||
this.controllerIp = controllerIp;
|
||||
|
||||
this.workerReportsDirPath = workerReportsDirPath;
|
||||
this.workerId = workerId;
|
||||
}
|
||||
|
||||
public GeneralController() {}
|
||||
|
||||
@GetMapping("isAlive")
|
||||
public ResponseEntity<?> isWorkerAlive() {
|
||||
|
||||
logger.info("Received an \"isAlive\" request.");
|
||||
|
||||
return ResponseEntity.ok().build();
|
||||
}
|
||||
|
||||
|
||||
@GetMapping("isAvailableForWork")
|
||||
public ResponseEntity<?> isWorkerAvailableForWork() {
|
||||
public static boolean shouldShutdownWorker = false;
|
||||
|
||||
logger.info("Received an \"isWorkerAvailableForWork\" request.");
|
||||
@PostMapping("shutdownWorker")
|
||||
public ResponseEntity<?> shutdownWorkerGracefully(HttpServletRequest request)
|
||||
{
|
||||
String initMsg = "Received a \"shutdownWorker\" request. ";
|
||||
ResponseEntity<?> responseEntity = passSecurityChecks(request, initMsg);
|
||||
if ( responseEntity != null )
|
||||
return responseEntity;
|
||||
|
||||
if ( AssignmentsHandler.isAvailableForWork ) {
|
||||
logger.info("The worker is available for an assignment.");
|
||||
return ResponseEntity.status(200).body(new WorkerResponse(UrlsWorkerApplication.workerId, UrlsWorkerApplication.maxAssignmentsLimitPerBatch));
|
||||
} else {
|
||||
logger.info("The worker is busy with another assignment.");
|
||||
return ResponseEntity.status(HttpStatus.SERVICE_UNAVAILABLE).build();
|
||||
String finalMsg = "";
|
||||
if ( shouldShutdownWorker )
|
||||
finalMsg = "The worker has already received a \"shutdownWorker\" request (which was not canceled afterwards). ";
|
||||
else {
|
||||
shouldShutdownWorker = true;
|
||||
AssignmentsHandler.shouldNotRequestMore = true;
|
||||
}
|
||||
|
||||
finalMsg += "The worker will shutdown, after finishing current work.";
|
||||
logger.info(initMsg + finalMsg);
|
||||
return ResponseEntity.ok().body(finalMsg + "\n");
|
||||
}
|
||||
|
||||
|
||||
@PostMapping("cancelShutdownWorker")
|
||||
public ResponseEntity<?> cancelShutdownWorkerGracefully(HttpServletRequest request)
|
||||
{
|
||||
String initMsg = "Received a \"cancelShutdownWorker\" request. ";
|
||||
ResponseEntity<?> responseEntity = passSecurityChecks(request, initMsg);
|
||||
if ( responseEntity != null )
|
||||
return responseEntity;
|
||||
|
||||
shouldShutdownWorker = false;
|
||||
if ( AssignmentsHandler.numHandledAssignmentsBatches < assignmentsHandler.maxAssignmentsBatchesToHandleBeforeShutdown )
|
||||
AssignmentsHandler.shouldNotRequestMore = false; // Make sure the worker shuts-down, in case the user sends the relevant request, while the worker is stuck in a data-request error-loop.
|
||||
|
||||
String finalMsg = "Any previous \"shutdownWorker\"-request is canceled. The \"maxAssignmentsBatchesToHandleBeforeShutdown\" will still be honored (if it's set).";
|
||||
logger.info(initMsg + finalMsg);
|
||||
return ResponseEntity.ok().body(finalMsg + "\n");
|
||||
}
|
||||
|
||||
|
||||
@GetMapping("getHandledAssignmentsCounts")
|
||||
public ResponseEntity<?> getHandledAssignmentsCounts()
|
||||
{
|
||||
List<Long> handledAssignmentsCounts = new ArrayList<>(FullTextsController.assignmentsNumsHandledAndLocallyDeleted.size()/2);
|
||||
for ( Map.Entry<Long,Boolean> entry : FullTextsController.assignmentsNumsHandledAndLocallyDeleted.entrySet() )
|
||||
{
|
||||
if ( entry.getValue().equals(true) )
|
||||
handledAssignmentsCounts.add(entry.getKey());
|
||||
return ResponseEntity.ok(AssignmentsHandler.handledAssignmentsCounters);
|
||||
}
|
||||
|
||||
|
||||
@PostMapping("addReportResultToWorker/{assignmentsCounter}")
|
||||
public ResponseEntity<?> addReportResultToWorker(@PathVariable long assignmentsCounter, @RequestBody(required=false) String errorMsg)
|
||||
{
|
||||
if ( ! AssignmentsHandler.handledAssignmentsCounters.contains(assignmentsCounter) ) {
|
||||
errorMsg = "The \"addReportResultToWorker\"-endpoint was called for an unknown \"assignmentsCounter\": " + assignmentsCounter;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.status(HttpStatus.NOT_FOUND).body(errorMsg);
|
||||
}
|
||||
return ResponseEntity.ok(handledAssignmentsCounts);
|
||||
|
||||
if ( errorMsg == null ) {
|
||||
logger.info("The Controller successfully handled the WorkerReport, for assignments_" + assignmentsCounter + ". The worker-report and all full-text files associated with it, will be deleted.");
|
||||
String directoryPath = PublicationsRetrieverPlugin.assignmentsBasePath + "assignments_" + assignmentsCounter + "_fullTexts";
|
||||
File dir = new File(directoryPath);
|
||||
if ( dir.isDirectory() )
|
||||
FullTextsController.deleteAssignmentsDirectory(assignmentsCounter, dir);
|
||||
else
|
||||
logger.warn("The full-texts directory \"" + directoryPath + "\" has already been deleted by the scheduler.");
|
||||
|
||||
FullTextsController.deleteFile(this.workerReportsDirPath + this.workerId + "_assignments_" + assignmentsCounter + "_report.json");
|
||||
} else
|
||||
logger.error("The Controller failed to handle the WorkerReport, for assignments_" + assignmentsCounter + ". The error is:\n" + errorMsg);
|
||||
|
||||
return ResponseEntity.ok().build();
|
||||
}
|
||||
|
||||
|
||||
public ResponseEntity<?> passSecurityChecks(HttpServletRequest request, String initMsg)
|
||||
{
|
||||
if ( request == null ) {
|
||||
logger.error(initMsg + "The \"HttpServletRequest\" is null!");
|
||||
return ResponseEntity.internalServerError().build();
|
||||
}
|
||||
String remoteAddr = request.getHeader("X-FORWARDED-FOR"); // This retrieves the original IP address, if the request passes through a proxy server.
|
||||
if ( remoteAddr == null )
|
||||
remoteAddr = request.getRemoteAddr();
|
||||
|
||||
if ( ! (remoteAddr.equals("127.0.0.1") || remoteAddr.equals(UriBuilder.ip) || remoteAddr.equals(controllerIp)) ) {
|
||||
logger.error(initMsg + "The request came from another IP: " + remoteAddr + " | while this worker has the IP: " + UriBuilder.ip);
|
||||
return ResponseEntity.status(HttpStatus.FORBIDDEN).build();
|
||||
}
|
||||
return null; // The checks are passing.
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,16 +0,0 @@
|
|||
package eu.openaire.urls_worker.exceptions;
|
||||
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.web.bind.annotation.ResponseStatus;
|
||||
|
||||
@ResponseStatus(HttpStatus.INTERNAL_SERVER_ERROR)
|
||||
public class FileStorageException extends RuntimeException {
|
||||
|
||||
public FileStorageException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public FileStorageException(String message, Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
}
|
|
@ -17,7 +17,8 @@ import java.sql.Timestamp;
|
|||
"size",
|
||||
"hash",
|
||||
"location",
|
||||
"provenance"
|
||||
"provenance",
|
||||
"datasourceId"
|
||||
})
|
||||
public class Payload {
|
||||
|
||||
|
@ -48,9 +49,13 @@ public class Payload {
|
|||
@JsonProperty("provenance")
|
||||
private String provenance; // "crawl:<PluginName>"
|
||||
|
||||
@JsonProperty("datasourceId")
|
||||
private String datasourceId;
|
||||
|
||||
|
||||
public Payload() {}
|
||||
|
||||
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance) {
|
||||
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance, String datasourceId) {
|
||||
this.id = id;
|
||||
this.original_url = original_url;
|
||||
this.actual_url = actual_url;
|
||||
|
@ -60,6 +65,7 @@ public class Payload {
|
|||
this.hash = hash;
|
||||
this.location = location;
|
||||
this.provenance = provenance;
|
||||
this.datasourceId = datasourceId;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
|
@ -134,18 +140,29 @@ public class Payload {
|
|||
this.provenance = provenance;
|
||||
}
|
||||
|
||||
public String getDatasourceId() {
|
||||
return datasourceId;
|
||||
}
|
||||
|
||||
public void setDatasourceId(String datasourceId) {
|
||||
this.datasourceId = datasourceId;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Payload{" +
|
||||
"id='" + id + '\'' +
|
||||
", original_url='" + original_url + '\'' +
|
||||
", actual_url='" + actual_url + '\'' +
|
||||
", timestamp_acquired='" + timestamp_acquired + '\'' +
|
||||
", timestamp_acquired=" + timestamp_acquired +
|
||||
", mime_type='" + mime_type + '\'' +
|
||||
", size='" + size + '\'' +
|
||||
", size=" + size +
|
||||
", hash='" + hash + '\'' +
|
||||
", location='" + location + '\'' +
|
||||
", provenance='" + provenance + '\'' +
|
||||
", datasourceId='" + datasourceId + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.openaire.urls_worker.payloads.responces;
|
|||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonPropertyOrder;
|
||||
import com.google.gson.Gson;
|
||||
import eu.openaire.urls_worker.models.UrlReport;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -16,6 +17,8 @@ import java.util.List;
|
|||
})
|
||||
public class WorkerReport {
|
||||
|
||||
private static final Gson gson = new Gson(); // This is "transient" be default. It won't be included in any json object.
|
||||
|
||||
@JsonProperty("workerId")
|
||||
private String workerId;
|
||||
|
||||
|
@ -55,6 +58,11 @@ public class WorkerReport {
|
|||
this.urlReports = urlReports;
|
||||
}
|
||||
|
||||
public String getJsonReport() {
|
||||
return gson.toJson(this, WorkerReport.class);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "WorkerReport{" +
|
||||
|
|
|
@ -1,237 +0,0 @@
|
|||
package eu.openaire.urls_worker.plugins;
|
||||
|
||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
||||
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||||
import eu.openaire.publications_retriever.util.file.FileUtils;
|
||||
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
|
||||
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
|
||||
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
|
||||
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
|
||||
import eu.openaire.publications_retriever.util.url.UrlUtils;
|
||||
import eu.openaire.urls_worker.UrlsWorkerApplication;
|
||||
import eu.openaire.urls_worker.models.Assignment;
|
||||
import eu.openaire.urls_worker.models.Error;
|
||||
import eu.openaire.urls_worker.models.Payload;
|
||||
import eu.openaire.urls_worker.models.UrlReport;
|
||||
import eu.openaire.urls_worker.services.FileStorageService;
|
||||
import eu.openaire.urls_worker.util.AssignmentsHandler;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.Timestamp;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
|
||||
public class PublicationsRetrieverPlugin {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class);
|
||||
|
||||
public static String assignmentsBasePath;
|
||||
|
||||
|
||||
public PublicationsRetrieverPlugin() {
|
||||
// Specify some configurations
|
||||
LoaderAndChecker.retrieveDocuments = true;
|
||||
LoaderAndChecker.retrieveDatasets = false;
|
||||
ConnSupportUtils.setKnownMimeTypes();
|
||||
FileUtils.shouldDownloadDocFiles = true;
|
||||
FileUtils.docFileNameType = FileUtils.DocFileNameType.idName;
|
||||
PublicationsRetriever.targetUrlType = "docUrl";
|
||||
FileUtils.jsonBatchSize = UrlsWorkerApplication.maxAssignmentsLimitPerBatch;
|
||||
|
||||
assignmentsBasePath = FileStorageService.assignmentsLocation.toString();
|
||||
if ( !assignmentsBasePath.endsWith(File.separator) )
|
||||
assignmentsBasePath += File.separator;
|
||||
|
||||
ConnSupportUtils.shouldBlockMost5XXDomains = false;
|
||||
LoaderAndChecker.setCouldRetryRegex();
|
||||
|
||||
PublicationsRetriever.threadsMultiplier = 4;
|
||||
int workerThreadsCount = Runtime.getRuntime().availableProcessors() * PublicationsRetriever.threadsMultiplier;
|
||||
logger.info("Use " + workerThreadsCount + " worker-threads.");
|
||||
PublicationsRetriever.executor = Executors.newFixedThreadPool(workerThreadsCount);
|
||||
}
|
||||
|
||||
private static final List<Callable<Boolean>> callableTasks = new ArrayList<>(FileUtils.jsonBatchSize);
|
||||
|
||||
public static void processAssignments(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException
|
||||
{
|
||||
FileUtils.storeDocFilesDir = assignmentsBasePath + "assignments_" + assignmentRequestCounter + "_fullTexts" + File.separator; // It needs the last separator, because of how the docFiles are named and stored.
|
||||
|
||||
File curAssignmentsDirs = new File(FileUtils.storeDocFilesDir);
|
||||
try {
|
||||
if ( !curAssignmentsDirs.exists() ) {
|
||||
if ( !curAssignmentsDirs.mkdirs() ) { // Create the directories.
|
||||
String workingDir = System.getProperty("user.dir") + File.separator;
|
||||
logger.error("Could not create the \"assignments_fullTexts directories\": \"" + FileUtils.storeDocFilesDir + "\". Using the \"workingDir\" instead (" + workingDir + ").");
|
||||
FileUtils.storeDocFilesDir = assignmentsBasePath = workingDir;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
String errorMsg = "Failed to create the full-texts directory for assignments_" + assignmentRequestCounter;
|
||||
logger.error(errorMsg, e);
|
||||
throw new RuntimeException(errorMsg + ": " + e.getMessage());
|
||||
}
|
||||
|
||||
// Start loading and checking urls.
|
||||
for ( Assignment assignment : assignments )
|
||||
{
|
||||
callableTasks.add(() -> {
|
||||
String id = assignment.getId();
|
||||
String url = assignment.getOriginalUrl();
|
||||
|
||||
if ( (id == null) || id.isEmpty() || (url == null) || url.isEmpty() ) {
|
||||
String errorMsg = "Got null or empty pair! ID=" + id + " , url=" + url;
|
||||
logger.warn(errorMsg);
|
||||
UrlUtils.logOutputData(id, url, null, "unreachable", "Discarded at loading time, due to input problems. " + errorMsg, null, true, "true", "false", "false", "false", "false", null, null);
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( (url = LoaderAndChecker.handleUrlChecks(id, url)) == null ) {
|
||||
return false;
|
||||
} // The "url" might have changed (inside "handleUrlChecks()").
|
||||
|
||||
String urlToCheck = url;
|
||||
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
||||
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
|
||||
logger.warn("Could not canonicalize url: " + sourceUrl);
|
||||
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
|
||||
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(url) ) { // If we got into an already-found docUrl, log it and return.
|
||||
ConnSupportUtils.handleReCrossedDocUrl(id, url, url, url, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean isPossibleDocOrDatasetUrl = false; // Used for specific connection settings.
|
||||
String lowerCaseRetrievedUrl = url.toLowerCase();
|
||||
// Check if it's a possible-DocUrl, if so, this info will be used for optimal web-connection later.
|
||||
if ( (LoaderAndChecker.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches())
|
||||
|| (LoaderAndChecker.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches()) ) {
|
||||
//logger.debug("Possible docUrl or datasetUrl: " + url);
|
||||
isPossibleDocOrDatasetUrl = true;
|
||||
}
|
||||
|
||||
try { // Check if it's a docUrl, if not, it gets crawled.
|
||||
HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
|
||||
} catch (Exception e) {
|
||||
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
|
||||
String wasUrlValid = list.get(0);
|
||||
String couldRetry = list.get(1);
|
||||
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
int numFailedTasks = LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
|
||||
if ( numFailedTasks == -1 ) { // The unknown exception is logged inside the above method.
|
||||
System.err.println("Invoking and/or executing the callableTasks failed with the exception written in the log files!");
|
||||
UrlsWorkerApplication.gentleAppShutdown();
|
||||
}
|
||||
|
||||
if ( numFailedTasks > 0 )
|
||||
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
|
||||
|
||||
addUrlReportsToWorkerReport();
|
||||
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
||||
|
||||
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
|
||||
// In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
|
||||
}
|
||||
|
||||
|
||||
public static void addUrlReportsToWorkerReport()
|
||||
{
|
||||
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
|
||||
|
||||
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
||||
{
|
||||
UrlReport.StatusType status = null;
|
||||
String fileLocation = null, comment = data.getComment(), mimeType = null, hash = data.getHash();
|
||||
Long size = data.getSize();
|
||||
Error error = null;
|
||||
|
||||
if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) ) // The reversed order defends against a potential NPE.
|
||||
{
|
||||
status = UrlReport.StatusType.accessible;
|
||||
if ( comment.startsWith(UrlUtils.alreadyDownloadedFromIDMessage, 0) ) {
|
||||
// The file of this docUrl was already downloaded by another docUrl.
|
||||
int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage);
|
||||
int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length();
|
||||
String initialId = comment.substring(UrlUtils.alreadyDownloadedFromIDMessage.length(), indexOfAlreadyDownloadedFromSourceUrlMessage); // The fileName starts right after the "message".
|
||||
String initialSourceUrl = comment.substring(indexOfAlreadyDownloadedFromSourceUrl);
|
||||
//logger.debug("initialId: " + initialId + " | sourceUrl: " + initialSourceUrl); // DEBUG!
|
||||
// Search that ID and sourceUrl inside the list, if that instance is the first-found one, then get the file-data (there might be duplicate ID-sourceUrl instances, but only one of them has the file-data).
|
||||
boolean foundAlreadyDownloadedFullText = false;
|
||||
for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {
|
||||
if ( data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl))
|
||||
&& ! data_2.getComment().startsWith(UrlUtils.alreadyDownloadedFromIDMessage) ) {
|
||||
fileLocation = data_2.getComment();
|
||||
size = data_2.getSize();
|
||||
hash = data_2.getHash();
|
||||
mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||
foundAlreadyDownloadedFullText = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// In case the "alreadyDownloaded" full-text is not found, we have an error.
|
||||
if ( !foundAlreadyDownloadedFullText )
|
||||
error = new Error(Error.ErrorType.couldRetry, comment + " | That ID-sourceUrl was not found inside the WorkerReport!"); // We can still try to download it from the found docUrl, in the future.
|
||||
}
|
||||
else if ( ! comment.equals(HttpConnUtils.docFileNotRetrievedMessage) ) { // If it was downloaded without an error.
|
||||
fileLocation = comment; // This is the full-file-path.
|
||||
mimeType = "application/pdf";
|
||||
} else // Else the file was not retrieved, so all file-related data are kept "null".
|
||||
error = new Error(Error.ErrorType.couldRetry, comment); // We can still try to download it from the found docUrl, in the future.
|
||||
|
||||
if ( error == null ) // If the file was retrieved, in any time.
|
||||
error = new Error(Error.ErrorType.couldRetry, null); // We do not want to send a "null" Error-object, since it just adds more complicated handling in the controller..
|
||||
}
|
||||
else {
|
||||
status = UrlReport.StatusType.non_accessible;
|
||||
if ( "true".equals(data.getCouldRetry()) )
|
||||
error = new Error(Error.ErrorType.couldRetry, comment);
|
||||
else
|
||||
error = new Error(Error.ErrorType.noRetry, comment);
|
||||
}
|
||||
|
||||
String docOrDatasetUrl = data.getDocOrDatasetUrl();
|
||||
if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
|
||||
docOrDatasetUrl = null;
|
||||
|
||||
// Convert "null" strings to actual < null >
|
||||
if ( (hash != null) && (hash.equals("null")) )
|
||||
hash = null;
|
||||
|
||||
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever");
|
||||
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||
|
||||
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
|
||||
}// end-for
|
||||
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
|
||||
}
|
||||
|
||||
|
||||
public static boolean connectWithUrlTest(String urlToCheck) {
|
||||
String testID = "testID";
|
||||
try {
|
||||
return HttpConnUtils.connectAndCheckMimeType(testID, urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
|
||||
} catch (Exception e) {
|
||||
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
|
||||
String wasUrlValid = list.get(0);
|
||||
String couldRetry = list.get(1);
|
||||
UrlUtils.logOutputData(testID, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,13 +1,13 @@
|
|||
package eu.openaire.urls_worker.security;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.security.config.annotation.method.configuration.EnableGlobalMethodSecurity;
|
||||
import org.springframework.security.config.annotation.web.builders.HttpSecurity;
|
||||
import org.springframework.security.config.annotation.web.configuration.EnableWebSecurity;
|
||||
import org.springframework.security.config.annotation.web.configuration.WebSecurityConfigurerAdapter;
|
||||
import org.springframework.security.config.http.SessionCreationPolicy;
|
||||
import org.springframework.security.web.SecurityFilterChain;
|
||||
|
||||
|
||||
@Configuration
|
||||
|
@ -17,14 +17,10 @@ import org.springframework.security.config.http.SessionCreationPolicy;
|
|||
jsr250Enabled = true,
|
||||
prePostEnabled = true
|
||||
)
|
||||
public class SecurityConfiguration extends WebSecurityConfigurerAdapter {
|
||||
public class SecurityConfiguration {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SecurityConfiguration.class);
|
||||
|
||||
|
||||
// Defines which resources are public and which are secured.
|
||||
@Override
|
||||
protected void configure(HttpSecurity http) throws Exception {
|
||||
@Bean
|
||||
public SecurityFilterChain filterChain(HttpSecurity http) throws Exception {
|
||||
http
|
||||
.headers()
|
||||
.frameOptions()
|
||||
|
@ -46,5 +42,7 @@ public class SecurityConfiguration extends WebSecurityConfigurerAdapter {
|
|||
//.requiresChannel()
|
||||
//.anyRequest().requiresSecure()
|
||||
;
|
||||
return http.build();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,20 +1,15 @@
|
|||
package eu.openaire.urls_worker.services;
|
||||
|
||||
import eu.openaire.urls_worker.exceptions.FileStorageException;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.core.io.UrlResource;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Properties;
|
||||
|
||||
|
||||
@Service
|
||||
|
@ -22,45 +17,26 @@ public class FileStorageService {
|
|||
|
||||
private static final Logger logger = LoggerFactory.getLogger(FileStorageService.class);
|
||||
|
||||
public static Path assignmentsLocation = null;
|
||||
|
||||
static {
|
||||
String springPropertiesFile = System.getProperty("user.dir") + File.separator + "src" + File.separator + "main" + File.separator + "resources" + File.separator + "application.properties";
|
||||
FileReader fReader = null;
|
||||
try {
|
||||
fReader = new FileReader(springPropertiesFile);
|
||||
Properties props = new Properties();
|
||||
props.load(fReader); // Load jdbc related properties.
|
||||
String assignmentsDir = props.getProperty("file.assignments-dir");
|
||||
assignmentsLocation = Paths.get(assignmentsDir).toAbsolutePath().normalize();
|
||||
} catch (java.io.FileNotFoundException fnfe) {
|
||||
logger.error("The properties file was not found!", fnfe);
|
||||
System.exit(-10);
|
||||
} catch (IOException ioe) {
|
||||
logger.error("I/O error when reading the properties file!", ioe);
|
||||
System.exit(-11);
|
||||
}
|
||||
}
|
||||
public String assignmentsBaseLocation = null;
|
||||
|
||||
|
||||
@Autowired
|
||||
public FileStorageService() throws FileStorageException {
|
||||
try {
|
||||
Files.createDirectories(assignmentsLocation);
|
||||
} catch (Exception ex) {
|
||||
throw new FileStorageException("Could not create the directory where the uploaded files will be stored.", ex);
|
||||
}
|
||||
}
|
||||
public FileStorageService(@Value("${file.assignments-dir}") String assignmentsBaseLocation) {
|
||||
this.assignmentsBaseLocation = assignmentsBaseLocation;
|
||||
|
||||
// In case the user-defined storageDir starts with "./", then replace that part with the actual user.dir", in order to have valid storage-locations for fileName-extraction in the Controller, even if the files are correctly downloaded there.
|
||||
if ( this.assignmentsBaseLocation.startsWith("." + File.separator) )
|
||||
this.assignmentsBaseLocation = ((System.getProperty("user.dir") + File.separator) + StringUtils.replace(this.assignmentsBaseLocation, ("." + File.separator), "", 1));
|
||||
|
||||
public Resource loadFileAsResource(String fullFileName) {
|
||||
if ( !this.assignmentsBaseLocation.endsWith(File.separator) )
|
||||
this.assignmentsBaseLocation += File.separator;
|
||||
|
||||
// Create the base-directory.
|
||||
try {
|
||||
Path filePath = assignmentsLocation.resolve(fullFileName).normalize();
|
||||
Resource resource = new UrlResource(filePath.toUri());
|
||||
return resource.exists() ? resource : null;
|
||||
Files.createDirectories(Paths.get(this.assignmentsBaseLocation));
|
||||
} catch (Exception e) {
|
||||
logger.error("Error when loading file: " + fullFileName, e);
|
||||
return null;
|
||||
logger.error("Could not create the base-directory where the downloaded files will be stored!", e);
|
||||
System.exit(-10);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,180 +0,0 @@
|
|||
package eu.openaire.urls_worker.util;
|
||||
|
||||
import com.google.common.collect.HashMultimap;
|
||||
import com.google.common.collect.Multimap;
|
||||
import eu.openaire.urls_worker.UrlsWorkerApplication;
|
||||
import eu.openaire.urls_worker.components.ScheduledTasks;
|
||||
import eu.openaire.urls_worker.models.Assignment;
|
||||
import eu.openaire.urls_worker.models.UrlReport;
|
||||
import eu.openaire.urls_worker.payloads.requests.AssignmentsRequest;
|
||||
import eu.openaire.urls_worker.payloads.responces.WorkerReport;
|
||||
import eu.openaire.urls_worker.plugins.PublicationsRetrieverPlugin;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.boot.web.client.RestTemplateBuilder;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.client.RestClientException;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
public class AssignmentsHandler {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(AssignmentsHandler.class);
|
||||
|
||||
public static boolean isAvailableForWork = true;
|
||||
public static List<UrlReport> urlReports = null;
|
||||
private static final int expectedDatasourcesPerRequest = 1400; // Per 10_000 assignments.
|
||||
public static Multimap<String, Assignment> assignmentsForPlugins = null;
|
||||
private static final boolean askForTest = false; // Enable this only for testing.
|
||||
|
||||
private static final Duration requestConnectTimeoutDuration = Duration.ofMinutes(1); // 1 minute.
|
||||
private static final Duration requestReadTimeoutDuration = Duration.ofMinutes(60); // 60 minutes. Time to wait for the data to get transferred over the network. Many workers may try to get assignments from the Worker, so each worker might have to wait some 10s of minutes for work.
|
||||
// The controller has to retrieve the data from the database, then prepare them in memory, insert them in the "assignment"-table and, finally, return them to the worker.
|
||||
|
||||
public static final RestTemplate restTemplate = new RestTemplateBuilder().setConnectTimeout(requestConnectTimeoutDuration).setReadTimeout(requestReadTimeoutDuration).build();
|
||||
|
||||
public static long numHandledAssignmentsBatches = 0; // No need to be synchronized.
|
||||
|
||||
|
||||
public AssignmentsHandler()
|
||||
{
|
||||
urlReports = new ArrayList<>(UrlsWorkerApplication.maxAssignmentsLimitPerBatch);
|
||||
int expectedAssignmentsPerDatasource = (UrlsWorkerApplication.maxAssignmentsLimitPerBatch / expectedDatasourcesPerRequest);
|
||||
assignmentsForPlugins = HashMultimap.create(expectedDatasourcesPerRequest, expectedAssignmentsPerDatasource);
|
||||
}
|
||||
|
||||
|
||||
public static AssignmentsRequest requestAssignments()
|
||||
{
|
||||
String requestUrl = UrlsWorkerApplication.controllerBaseUrl + "urls" + (askForTest ? "/test" : "") + "?workerId=" + UrlsWorkerApplication.workerId + "&workerAssignmentsLimit=" + UrlsWorkerApplication.maxAssignmentsLimitPerBatch;
|
||||
logger.info("Going to request assignments from the controller-server: " + requestUrl);
|
||||
|
||||
AssignmentsRequest assignmentRequest = null;
|
||||
try { // Here, the HTTP-request is executed.
|
||||
assignmentRequest = restTemplate.getForObject(requestUrl, AssignmentsRequest.class);
|
||||
} catch (RestClientException rce) {
|
||||
logger.error("Could not retrieve the assignments!\n" + rce.getMessage()); // It shows the response body (after Spring v.2.5.6).
|
||||
return null;
|
||||
}
|
||||
|
||||
//logger.debug(assignmentRequest.toString()); // DEBUG!
|
||||
return assignmentRequest;
|
||||
}
|
||||
|
||||
|
||||
public static void handleAssignments()
|
||||
{
|
||||
AssignmentsRequest assignmentsRequest = requestAssignments();
|
||||
if ( assignmentsRequest == null )
|
||||
return;
|
||||
|
||||
Long assignmentRequestCounter = assignmentsRequest.getAssignmentsCounter();
|
||||
List<Assignment> assignments = assignmentsRequest.getAssignments();
|
||||
if ( assignments == null ) {
|
||||
logger.warn("The assignments were found to be null for assignmentRequestCounter = " + assignmentRequestCounter);
|
||||
return;
|
||||
}
|
||||
|
||||
int assignmentsSize = assignments.size();
|
||||
if ( assignmentsSize == 0 ) {
|
||||
logger.warn("The assignmentsSize was < 0 > for assignmentRequestCounter = " + assignmentRequestCounter);
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("AssignmentRequest < " + assignmentRequestCounter + " > was received and it's ready to be processed. It contains " + assignmentsSize + " tasks.");
|
||||
|
||||
// Start handling the assignments, the worker is busy.
|
||||
isAvailableForWork = false;
|
||||
|
||||
// Iterate over the tasks and add each task in its own list depending on the DATASOURCE in order to decide which plugin to use later.
|
||||
|
||||
for ( Assignment assignment : assignments ) {
|
||||
// Add each task in its own HashSet.
|
||||
try {
|
||||
assignmentsForPlugins.put(assignment.getDatasource().getId(), assignment);
|
||||
} catch (NullPointerException npe) {
|
||||
logger.warn("An NPE was thrown when splitting the assignments based on the datasource-types. The assignment was: " + assignment); // Do not use "assignment.toString()", it may cause an NPE.
|
||||
}
|
||||
}
|
||||
|
||||
//countDatasourcesAndRecords(assignmentsSize); // Only for DEBUG! Keep it commented in normal run.
|
||||
|
||||
// TODO - Decide which tasks run with what plugin (depending on their datasource).
|
||||
// First run -in parallel- the tasks which require some specific plugin.
|
||||
// Then run the remaining tasks in the generic plugin (which handles parallelism itself).
|
||||
|
||||
// For now, let's just run all tasks in the generic plugin.
|
||||
try {
|
||||
PublicationsRetrieverPlugin.processAssignments(assignmentRequestCounter, assignmentsForPlugins.values());
|
||||
} catch (Exception e) {
|
||||
logger.error("Exception when processing the assignments_" + assignmentRequestCounter, e);
|
||||
} // In this case, we will either have an empty WorkerReport or a half-filled one. Either way, we want to report back to the Controller.
|
||||
|
||||
// TODO - If we have more than one plugin running at the same time, then make the "AssignmentsHandler.urlReports"-list thread-safe.
|
||||
|
||||
if ( askForTest ) {
|
||||
logger.debug("UrlReports:"); // DEBUG!
|
||||
for ( UrlReport urlReport : urlReports )
|
||||
logger.debug(urlReport.toString());
|
||||
} // Avoid posting the results in "askForTestUrls"-mode. We don't want for test-results to be written into the database by the controller.
|
||||
else
|
||||
postWorkerReport(assignmentRequestCounter);
|
||||
|
||||
numHandledAssignmentsBatches ++; // This is used later to stop this app, when a user-defined upper limit is reached.
|
||||
|
||||
isAvailableForWork = true; // State this after posting, to avoid breaking the "UrlReports" in the current or the next run.
|
||||
// Also, since the worker has limited resources, it's better to finish sending the full-texts first and then request a new batch of assignments.
|
||||
|
||||
// Note: Cannot call this method here retrospectively, as if it runs 100s of times, the memory-stack may break..
|
||||
// The scheduler will handle calling it every 15 mins, in case the Worker is available for work..
|
||||
|
||||
if ( AssignmentsHandler.numHandledAssignmentsBatches == UrlsWorkerApplication.maxAssignmentsBatchesToHandleBeforeRestart )
|
||||
{
|
||||
logger.info("The maximum assignments-batches (" + UrlsWorkerApplication.maxAssignmentsBatchesToHandleBeforeRestart + ") to be handled was reached! Shut down, in order for the external Linux-service to restart on its own..");
|
||||
UrlsWorkerApplication.gentleAppShutdown();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean postWorkerReport(Long assignmentRequestCounter)
|
||||
{
|
||||
String postUrl = UrlsWorkerApplication.controllerBaseUrl + "urls/addWorkerReport";
|
||||
logger.info("Going to post the WorkerReport of assignment_" + assignmentRequestCounter + " to the controller-server: " + postUrl);
|
||||
try {
|
||||
ResponseEntity<String> responseEntity = restTemplate.postForEntity(postUrl, new WorkerReport(UrlsWorkerApplication.workerId, assignmentRequestCounter, urlReports), String.class);
|
||||
int responseCode = responseEntity.getStatusCodeValue();
|
||||
if ( responseCode == HttpStatus.OK.value() ) {
|
||||
logger.info("The submission of the WorkerReport of assignments_" + assignmentRequestCounter + " to the Controller, and the full-text delivering, were successful!");
|
||||
return true;
|
||||
} else {
|
||||
logger.error("HTTP-Connection problem with the submission of the WorkerReport of assignment_" + assignmentRequestCounter + " to the Controller. Error-code was: " + responseCode);
|
||||
return false;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("Error when submitting the WorkerReport of assignment_" + assignmentRequestCounter + " to the Controller: ", e);
|
||||
return false;
|
||||
} finally {
|
||||
urlReports.clear(); // Reset, without de-allocating.
|
||||
assignmentsForPlugins.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void countDatasourcesAndRecords(int assignmentsSize)
|
||||
{
|
||||
Set<String> datasources = assignmentsForPlugins.keySet();
|
||||
int numDatasources = datasources.size();
|
||||
logger.debug("Num of datasources: " + numDatasources);
|
||||
for ( String datasource : datasources ) {
|
||||
logger.debug("Num of records for datasource \"" + datasource + "\" is: " + assignmentsForPlugins.get(datasource).size() );
|
||||
}
|
||||
logger.debug("Average num of records per datasource: " + (assignmentsSize / numDatasources));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,133 @@
|
|||
package eu.openaire.urls_worker.util;
|
||||
|
||||
import eu.openaire.urls_worker.controllers.FullTextsController;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorOutputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.NoSuchFileException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
|
||||
public class FilesCompressor {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(FilesCompressor.class);
|
||||
|
||||
public static final int bufferSize = (5 * 1_048_576); // 5 Mb
|
||||
|
||||
|
||||
public static File compressMultipleFilesIntoOne(long assignmentsCounter, int tarBatchCounter, List<String> filesToCompress, String baseDirectory)
|
||||
{
|
||||
// For example: assignments_2_full-texts_4.tar.zstd | where < 4 > is referred to the 4th batch of files requested by the Controller.
|
||||
File tarFile;
|
||||
try {
|
||||
tarFile = getTarArchiveWithFullTexts(filesToCompress, baseDirectory, assignmentsCounter, tarBatchCounter);
|
||||
} catch (Exception e) {
|
||||
logger.error("Exception when creating the tar-file for assignments_" + assignmentsCounter, e);
|
||||
return null;
|
||||
} finally {
|
||||
// Delete the files of this failed batch immediately. These files will not be requested again. The urls leading to these file will be reprocessed in the future.
|
||||
for ( String fileName : filesToCompress )
|
||||
FullTextsController.deleteFile(baseDirectory + fileName);
|
||||
}
|
||||
|
||||
// The "TAR" archive is not compressed, but it helps deliver multiple full-texts with a single Stream.
|
||||
// Then, we compress the archive, using Facebook's "ZStandard" algorithm, which delivers both high compression-rate and compression and decompression efficiency.
|
||||
|
||||
String tarFilePath = tarFile.getPath();
|
||||
String zStandardFileFullPath = tarFilePath + ".zstd";
|
||||
File zStandardFile = new File(zStandardFileFullPath);
|
||||
|
||||
try ( BufferedInputStream in = new BufferedInputStream(Files.newInputStream(Paths.get(tarFilePath)), bufferSize);
|
||||
ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(zStandardFile.toPath())), bufferSize) )
|
||||
{
|
||||
int readByte;
|
||||
while ( (readByte = in.read()) != -1 ) {
|
||||
zOut.write(readByte);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("Exception when compressing the tar-archive: " + tarFilePath, e);
|
||||
return null;
|
||||
} finally {
|
||||
FullTextsController.deleteFile(tarFilePath);
|
||||
}
|
||||
|
||||
logger.debug("Finished archiving and compressing the full-texts of assignments_" + assignmentsCounter + ", batch_" + tarBatchCounter);
|
||||
return zStandardFile;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method adds the requested full-text file into a TAR archive, which later will be compressed.
|
||||
* */
|
||||
private static File getTarArchiveWithFullTexts(List<String> filesToTar, String baseDir, long assignmentsCounter, int tarBatchCounter) throws Exception
|
||||
{
|
||||
String tarFileFullPath = baseDir + "assignments_" + assignmentsCounter + "_full-texts_" + tarBatchCounter + ".tar";
|
||||
// For example: assignments_2_full-texts_4.tar.zstd | where < 4 > is referred to the 4th batch of files requested by the Controller.
|
||||
|
||||
// https://commons.apache.org/proper/commons-compress/examples.html
|
||||
|
||||
int numTarredFiles = 0;
|
||||
File tarFile = new File(tarFileFullPath);
|
||||
|
||||
try ( TarArchiveOutputStream taos = new TarArchiveOutputStream(new BufferedOutputStream(Files.newOutputStream(tarFile.toPath()), bufferSize)) )
|
||||
{
|
||||
for ( String fileName : filesToTar ) {
|
||||
if ( addTarEntry(taos, fileName, baseDir) )
|
||||
numTarredFiles ++;
|
||||
}
|
||||
}
|
||||
|
||||
if ( numTarredFiles == 0 ) {
|
||||
throw new RuntimeException("None of the requested (" + filesToTar.size() + ") could be tarred, for assignments_" + assignmentsCounter + ", batch_" + tarBatchCounter);
|
||||
} else if ( numTarredFiles != filesToTar.size() )
|
||||
logger.warn("The number of \"numTarredFiles\" (" + numTarredFiles + ") is different from the number of files requested to be tarred (" + filesToTar.size() + "), for assignments_" + assignmentsCounter + ", batch_" + tarBatchCounter);
|
||||
// Still, some files may have been tarred, so we move on. It's up to the Controller, to handle such case.
|
||||
|
||||
return tarFile;
|
||||
}
|
||||
|
||||
|
||||
private static boolean addTarEntry(TarArchiveOutputStream taos, String fileName, String baseDir)
|
||||
{
|
||||
boolean shouldCloseEntry = false; // Useful in order to know if we should close the entry (an Exception may appear when initializing the stream, and so we should not try to close it).
|
||||
|
||||
Path fullFileNamePath = Paths.get(baseDir + fileName);
|
||||
try ( BufferedInputStream fis = new BufferedInputStream(Files.newInputStream(fullFileNamePath), bufferSize) ) {
|
||||
TarArchiveEntry entry = new TarArchiveEntry(fileName);
|
||||
entry.setSize(Files.size(fullFileNamePath)); // Yes, tar requires that we set the size beforehand..
|
||||
taos.putArchiveEntry(entry);
|
||||
shouldCloseEntry = true;
|
||||
|
||||
int readByte;
|
||||
while ( (readByte = fis.read()) != -1 ) {
|
||||
taos.write(readByte);
|
||||
}
|
||||
} catch (NoSuchFileException nsfe) {
|
||||
logger.error("NoSuchFileException: " + nsfe.getMessage());
|
||||
return false;
|
||||
} catch (Exception e) {
|
||||
logger.error("", e);
|
||||
return false;
|
||||
} finally {
|
||||
if ( shouldCloseEntry ) {
|
||||
try {
|
||||
taos.closeArchiveEntry(); // close just the TarEntry here (not the TarArchiveOutputStream)
|
||||
} catch (IOException e) {
|
||||
logger.error("", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,74 +0,0 @@
|
|||
package eu.openaire.urls_worker.util;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
|
||||
public class FilesZipper
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(FilesZipper.class);
|
||||
|
||||
|
||||
public static File zipMultipleFilesAndGetZip(long assignmentsCounter, int zipBatchCounter, List<String> filesToZip, String baseDirectory)
|
||||
{
|
||||
String zipFilename = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip";
|
||||
// For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the controller.
|
||||
|
||||
int numZippedFiles = 0;
|
||||
File zipFile = new File(zipFilename);
|
||||
try ( ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(zipFile), StandardCharsets.UTF_8) )
|
||||
{
|
||||
for ( String file : filesToZip ) {
|
||||
if ( zipAFile(file, zos, baseDirectory) )
|
||||
numZippedFiles ++;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("Exception when creating the zip-file: " + zipFilename, e);
|
||||
return null;
|
||||
}
|
||||
logger.debug("Zipped " + numZippedFiles + " files for assignments_" + assignmentsCounter + ", batch_" + zipBatchCounter);
|
||||
return zipFile;
|
||||
}
|
||||
|
||||
|
||||
private static final int BUFFER_SIZE = 3145728; // 3MB (average fullText-size)
|
||||
private static final byte[] dataBuffer = new byte[BUFFER_SIZE];
|
||||
|
||||
// This method is "synchronized" to avoid any future problems with shared-buffer, if the requests are asynchronous.
|
||||
private static synchronized boolean zipAFile(String fileName, ZipOutputStream zos, String baseDir)
|
||||
{
|
||||
boolean shouldCloseEntry = false; // Useful in order to close the entry in case of an exception.
|
||||
String fullFileName = baseDir + fileName;
|
||||
try ( BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName), BUFFER_SIZE) ) {
|
||||
zos.putNextEntry(new ZipEntry(fileName));
|
||||
shouldCloseEntry = true;
|
||||
int count;
|
||||
while ( (count = bis.read(dataBuffer, 0, BUFFER_SIZE)) != -1 ) {
|
||||
zos.write(dataBuffer, 0, count);
|
||||
}
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
logger.error("Error zipping file: " + fullFileName, fnfe.getMessage());
|
||||
return false;
|
||||
} catch (Exception e) {
|
||||
if ( ! e.getMessage().contains("duplicate") )
|
||||
logger.error("Error zipping file: " + fullFileName, e);
|
||||
return false;
|
||||
} finally {
|
||||
if ( shouldCloseEntry ) {
|
||||
try {
|
||||
zos.closeEntry(); // close the entry here (not the ZipOutputStream)
|
||||
} catch (IOException e) {
|
||||
logger.error("", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
|
@ -3,21 +3,24 @@ package eu.openaire.urls_worker.util;
|
|||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.boot.web.servlet.context.ServletWebServerApplicationContext;
|
||||
import org.springframework.core.env.Environment;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.InetAddress;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
|
||||
public class UriBuilder {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UriBuilder.class);
|
||||
|
||||
|
||||
public static String ip = null;
|
||||
public static String baseUrl = null;
|
||||
|
||||
public UriBuilder(Environment environment) {
|
||||
public UriBuilder(Environment environment, ServletWebServerApplicationContext webServerAppCtxt) {
|
||||
baseUrl = "http";
|
||||
|
||||
String sslEnabled = environment.getProperty("server.ssl.enabled");
|
||||
|
@ -26,21 +29,12 @@ public class UriBuilder {
|
|||
sslEnabled = "false";
|
||||
}
|
||||
baseUrl += sslEnabled.equals("true") ? "s" : "";
|
||||
|
||||
baseUrl += "://";
|
||||
|
||||
String hostName = getPublicIP();
|
||||
if ( hostName == null )
|
||||
hostName = InetAddress.getLoopbackAddress().getHostName(); // Non-null.
|
||||
if ( (ip = getPublicIP()) == null )
|
||||
ip = InetAddress.getLoopbackAddress().getHostAddress(); // Non-null.
|
||||
|
||||
baseUrl += hostName;
|
||||
|
||||
String serverPort = environment.getProperty("server.port");
|
||||
if (serverPort == null) { // This is unacceptable!
|
||||
logger.error("No property \"server.port\" was found in \"application.properties\"!");
|
||||
System.exit(-1); // Well, I guess the Spring Boot would not start in this case anyway.
|
||||
}
|
||||
baseUrl += ":" + serverPort;
|
||||
baseUrl += ip + ":" + webServerAppCtxt.getWebServer().getPort();
|
||||
|
||||
String baseInternalPath = environment.getProperty("server.servlet.context-path");
|
||||
if ( baseInternalPath != null ) {
|
||||
|
@ -60,18 +54,30 @@ public class UriBuilder {
|
|||
private static String getPublicIP()
|
||||
{
|
||||
String publicIpAddress = "";
|
||||
URL url_name;
|
||||
HttpURLConnection conn = null;
|
||||
String urlString = "https://checkip.amazonaws.com/";
|
||||
try {
|
||||
url_name = new URL("https://api.ipify.org/");
|
||||
} catch (MalformedURLException mue) {
|
||||
logger.warn(mue.getMessage());
|
||||
return null;
|
||||
}
|
||||
try ( BufferedReader bf = new BufferedReader(new InputStreamReader(url_name.openStream()))) {
|
||||
publicIpAddress = bf.readLine().trim();
|
||||
conn = (HttpURLConnection) new URL(urlString).openConnection();
|
||||
conn.setConnectTimeout(60_000); // 1 minute
|
||||
conn.setReadTimeout(120_000); // 2 minutes
|
||||
conn.setRequestMethod("GET");
|
||||
conn.connect();
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
if ( responseCode != 200 ) {
|
||||
logger.warn("Cannot get the publicIP address for this machine, as \"" + urlString + "\" returned the HTTP-error-code: " + responseCode);
|
||||
return null;
|
||||
}
|
||||
|
||||
try ( BufferedReader bf = new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
|
||||
publicIpAddress = bf.readLine().trim();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.warn("Cannot get the publicIP address for this machine!", e);
|
||||
logger.warn("Cannot get the publicIP address for this machine, from \"" + urlString + "\"!", e);
|
||||
return null;
|
||||
} finally {
|
||||
if ( conn != null )
|
||||
conn.disconnect();
|
||||
}
|
||||
return publicIpAddress;
|
||||
}
|
||||
|
@ -84,4 +90,4 @@ public class UriBuilder {
|
|||
UriBuilder.baseUrl = baseUrl;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +0,0 @@
|
|||
package eu.openaire.urls_worker.util;
|
||||
|
||||
public interface WorkerConstants {
|
||||
|
||||
int ASSIGNMENTS_LIMIT = 10_000;
|
||||
|
||||
}
|
|
@ -11,12 +11,30 @@
|
|||
|
||||
# HTTP CONFIGURATION
|
||||
server.port = 1881
|
||||
# You can set the above value to < 0 >, in order to choose a random port (a new random port is chosen automatically, if the previously chosen one is already in use).
|
||||
|
||||
# Server api path
|
||||
server.servlet.context-path=/api
|
||||
server.shutdown=graceful
|
||||
|
||||
spring.lifecycle.timeout-per-shutdown-phase=2m
|
||||
|
||||
#Input data configurations
|
||||
info.workerId = XX
|
||||
info.maxAssignmentsLimitPerBatch = 10000
|
||||
# If the "info.maxAssignmentsBatchesToHandleBeforeShutdown" is zero, then an infinite number of assignments-batches will be handled.
|
||||
info.maxAssignmentsBatchesToHandleBeforeShutdown = 0
|
||||
info.controllerIp = XX
|
||||
info.controllerPort = XX
|
||||
info.controllerBaseUrl = http://${info.controllerIp}:${info.controllerPort}/api/
|
||||
|
||||
|
||||
workerReportsDirPath: ${HOME}/workerReports/
|
||||
|
||||
|
||||
# LOGGING LEVELS
|
||||
logging.config=classpath:logback-spring.xml
|
||||
logging.file.path=logs
|
||||
logging.level.root=INFO
|
||||
logging.level.org.springframework.web=INFO
|
||||
logging.level.org.springframework.security=WARN
|
||||
|
|
|
@ -1,31 +1,33 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<configuration debug="false">
|
||||
|
||||
<appender name="RollingFile" class="ch.qos.logback.core.rolling.RollingFileAppender">
|
||||
<file>logs/UrlsWorker.log</file>
|
||||
<file>${LOG_PATH}/UrlsWorker.log</file>
|
||||
|
||||
<rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
|
||||
<fileNamePattern>logs/UrlsWorker.%i.log.zip</fileNamePattern>
|
||||
<fileNamePattern>${LOG_PATH}/UrlsWorker.%i.log.zip</fileNamePattern>
|
||||
<minIndex>1</minIndex>
|
||||
<maxIndex>20</maxIndex>
|
||||
<maxIndex>10</maxIndex>
|
||||
</rollingPolicy>
|
||||
|
||||
<triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
|
||||
<maxFileSize>50MB</maxFileSize>
|
||||
<maxFileSize>100MB</maxFileSize>
|
||||
</triggeringPolicy>
|
||||
<encoder>
|
||||
<charset>UTF-8</charset>
|
||||
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n</pattern>
|
||||
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS z} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n</pattern>
|
||||
</encoder>
|
||||
</appender>
|
||||
|
||||
<appender name="Console" class="ch.qos.logback.core.ConsoleAppender">
|
||||
<encoder>
|
||||
<charset>UTF-8</charset>
|
||||
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %highlight(%-5level) %cyan(%logger{36}.%M\(@%line\)) - %msg%n</pattern>
|
||||
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS z} [%thread] %highlight(%-5level) %cyan(%logger{36}.%M\(@%line\)) - %msg%n</pattern>
|
||||
</encoder>
|
||||
</appender>
|
||||
|
||||
<root level="debug">
|
||||
<!-- <appender-ref ref="Console" /> -->
|
||||
<appender-ref ref="RollingFile" />
|
||||
</root>
|
||||
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
package eu.openaire.urls_worker;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
//import org.junit.jupiter.api.Test;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
|
||||
@SpringBootTest
|
||||
class UrlsWorkerApplicationTests {
|
||||
|
||||
@Test
|
||||
void contextLoads() {
|
||||
//@Test // TODO - Enable when the test is ready.
|
||||
void test1() {
|
||||
// TODO - Write a test.
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue