- Use the newer API from the "PublicationsRetriever" software.

- Optimize memory de-allocation.
- Optimize Gradle settings.
- Set new version.
- Update dependencies.
Lampros Smyrnaios 2024-11-13 19:10:05 +02:00
parent 3eaeff468a
commit ef33e91553
8 changed files with 34 additions and 26 deletions

build.gradle

@@ -6,7 +6,7 @@ plugins {
java {
group = 'eu.openaire.urls_worker'
version = '2.1.13'
version = '2.1.14'
sourceCompatibility = JavaVersion.VERSION_1_8
}
@@ -23,7 +23,7 @@ repositories {
}
dependencies {
runtimeOnly "org.springframework.boot:spring-boot-devtools"
developmentOnly "org.springframework.boot:spring-boot-devtools"
implementation("org.springframework.boot:spring-boot-starter-web")
implementation("org.springframework.boot:spring-boot-starter-security")
@@ -37,7 +37,7 @@ dependencies {
// Enable the validation annotations.
//implementation group: 'jakarta.validation', name: 'jakarta.validation-api', version: '3.0.2'
implementation ("eu.openaire:publications_retriever:1.2-SNAPSHOT") {
implementation ("eu.openaire:publications_retriever:1.3-SNAPSHOT") {
exclude group: 'ch.qos.logback', module: 'logback-core'
exclude group: 'ch.qos.logback', module: 'logback-classic'
exclude group: 'org.slf4j', module: 'slf4j-api'
@@ -53,7 +53,7 @@ dependencies {
implementation("org.apache.commons:commons-compress:1.27.1") {
exclude group: 'com.github.luben', module: 'zstd-jni'
}
implementation 'com.github.luben:zstd-jni:1.5.6-6' // Even though this is part of the above dependency, Apache Commons rarely updates it, while the zstd team makes improvements very often.
implementation 'com.github.luben:zstd-jni:1.5.6-7' // Even though this is part of the above dependency, Apache Commons rarely updates it, while the zstd team makes improvements very often.
// Also, for compressing, we strangely need to declare it explicitly on its own; otherwise it does not work.
testImplementation 'org.springframework.security:spring-security-test'

gradle.properties

@@ -1,4 +1,5 @@
org.gradle.caching=true
org.gradle.parallel=true
org.gradle.configuration-cache.parallel=true
org.gradle.caching.debug=false
org.gradle.warning.mode=all

gradle/wrapper/gradle-wrapper.jar
Binary file not shown.

gradle/wrapper/gradle-wrapper.properties

@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10.2-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME

gradlew (vendored)

@@ -15,6 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
#
##############################################################################
#
@@ -84,7 +86,8 @@ done
# shellcheck disable=SC2034
APP_BASE_NAME=${0##*/}
# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036)
APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit
APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s
' "$PWD" ) || exit
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD=maximum

gradlew.bat (vendored)

@@ -13,6 +13,8 @@
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@rem SPDX-License-Identifier: Apache-2.0
@rem
@if "%DEBUG%"=="" @echo off
@rem ##########################################################################

installAndRun.sh

@@ -18,7 +18,7 @@ elif [[ $# -gt 1 ]]; then
echo -e "Wrong number of arguments given: ${#} (more than 1)\nPlease execute it like: script.sh <justRun: 0 | 1>"; exit 2
fi
gradleVersion="8.8"
gradleVersion="8.11"
shouldBeCarefulWithMaxHeap=0 # This is NOT a cmd-arg.

src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java

@@ -1,10 +1,11 @@
package eu.openaire.urls_worker.components.plugins;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.DataForOutput;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import eu.openaire.urls_worker.components.AssignmentsHandler;
@@ -47,7 +48,7 @@ public class PublicationsRetrieverPlugin {
ConnSupportUtils.setKnownMimeTypes();
FileUtils.shouldDownloadDocFiles = true;
FileUtils.docFileNameType = FileUtils.DocFileNameType.idName;
PublicationsRetriever.targetUrlType = "docUrl";
ArgsUtils.targetUrlType = "docUrl";
FileUtils.jsonBatchSize = maxAssignmentsLimitPerBatch;
assignmentsBasePath = fileStorageService.assignmentsBaseLocation;
@@ -59,11 +60,11 @@ public class PublicationsRetrieverPlugin {
int availableProcessors = Runtime.getRuntime().availableProcessors();
if ( availableProcessors <= 4 )
PublicationsRetriever.threadsMultiplier = 10;
ArgsUtils.threadsMultiplier = 10;
else
PublicationsRetriever.threadsMultiplier = 6;
ArgsUtils.threadsMultiplier = 6;
int workerThreadsCount = (availableProcessors * PublicationsRetriever.threadsMultiplier);
int workerThreadsCount = (availableProcessors * ArgsUtils.threadsMultiplier);
logger.info("Use " + workerThreadsCount + " worker-threads.");
PublicationsRetriever.executor = Executors.newFixedThreadPool(workerThreadsCount);
}
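For reference, here is a minimal, self-contained sketch of the updated initialization, using only the fields visible in this diff; the wrapper class and the standalone main() are illustrative assumptions, not the plugin's real structure:

import java.util.concurrent.Executors;

import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.util.args.ArgsUtils;

// Sketch only: settings that lived on "PublicationsRetriever" in 1.2-SNAPSHOT
// are set on "ArgsUtils" in 1.3-SNAPSHOT, while the executor stays where it was.
public class InitSketch {
    public static void main(String[] args) {
        ArgsUtils.targetUrlType = "docUrl"; // was: PublicationsRetriever.targetUrlType
        int availableProcessors = Runtime.getRuntime().availableProcessors();
        ArgsUtils.threadsMultiplier = (availableProcessors <= 4) ? 10 : 6; // was: PublicationsRetriever.threadsMultiplier
        int workerThreadsCount = (availableProcessors * ArgsUtils.threadsMultiplier);
        PublicationsRetriever.executor = Executors.newFixedThreadPool(workerThreadsCount);
    }
}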
@@ -106,7 +107,7 @@ public class PublicationsRetrieverPlugin {
if ( (id == null) || id.isEmpty() || (url == null) || url.isEmpty() ) {
String errorMsg = "Got null or empty pair! ID=" + id + " , url=" + url;
logger.warn(errorMsg);
UrlUtils.logOutputData(id, url, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to input problems. " + errorMsg, null, true, "true", "false", "false", "false", "false", null, null);
UrlUtils.addOutputData(id, url, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to input problems. " + errorMsg, null, true, "true", "false", "false", "false", "false", null, null);
return false;
}
@@ -117,7 +118,7 @@ public class PublicationsRetrieverPlugin {
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( (urlToCheck = LoaderAndChecker.basicURLNormalizer.filter(sourceUrl)) == null ) {
logger.warn("Could not normalize url: " + sourceUrl);
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization problems.", null, true, "true", "false", "false", "false", "false", null, null);
UrlUtils.addOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization problems.", null, true, "true", "false", "false", "false", "false", null, null);
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;
}
@@ -143,7 +144,7 @@ public class PublicationsRetrieverPlugin {
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.logOutputData(id, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
UrlUtils.addOutputData(id, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
return false;
}
return true;
@@ -161,13 +162,14 @@ public class PublicationsRetrieverPlugin {
if ( numFailedTasks > 0 )
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
ConnSupportUtils.domainsWithConnectionData.clear(); // This data is not useful for the next batch, since plenty of time will have passed before needing to check the "lastConnectedTime" for each domain, in order to apply the "politenessDelay".
addUrlReportsToWorkerReport(assignments);
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
// In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
// In the next batch, the previously stored files might have been already delivered to the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
ConnSupportUtils.domainsWithConnectionData.clear(); // This data is not useful for the next batch, since plenty of time will have passed before needing to check the "lastConnectedTime" for each domain, in order to apply the "politenessDelay".
//logger.debug("The number of cookies is: " + cookieStore.getCookies().size()); // debug!
boolean cookiesDeleted = cookieStore.removeAll();
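The cleared "domainsWithConnectionData" map backs the "politenessDelay" mentioned in the comment above. As a purely hypothetical illustration of that idea (all names below are invented for illustration and do not mirror the library's internals):

import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch: before re-connecting to a domain, wait until enough
// time has passed since the last connection to that same domain.
public class PolitenessSketch {
    private static final long POLITENESS_DELAY_MS = 1000L; // invented value
    private static final ConcurrentHashMap<String, Long> lastConnectedTime = new ConcurrentHashMap<>();

    public static void waitBeforeConnecting(String domain) throws InterruptedException {
        Long last = lastConnectedTime.get(domain);
        if ( last != null ) {
            long elapsedMs = System.currentTimeMillis() - last;
            if ( elapsedMs < POLITENESS_DELAY_MS )
                Thread.sleep(POLITENESS_DELAY_MS - elapsedMs); // respect the remaining delay
        }
        lastConnectedTime.put(domain, System.currentTimeMillis());
    }
}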
@@ -183,11 +185,11 @@ public class PublicationsRetrieverPlugin {
public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
{
if ( FileUtils.dataToBeLoggedList.size() != assignments.size() ) {
logger.warn("The number of the results (" + FileUtils.dataToBeLoggedList.size() + ") is different from the number of the given assignments (" + assignments.size() + ")!");
if ( FileUtils.dataForOutput.size() != assignments.size() ) {
logger.warn("The number of the results (" + FileUtils.dataForOutput.size() + ") is different from the number of the given assignments (" + assignments.size() + ")!");
} // TODO - Should any other step be taken, apart from just logging the message?
// Index the UrlIds with the DatasourceIds for quick-search later. The datasourceIds are not included in the "DataToBeLogged" objects.
// Index the UrlIds with the DatasourceIds for quick-search later. The datasourceIds are not included in the "DataForOutput" objects.
HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
for ( Assignment assignment : assignments )
urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
@@ -195,7 +197,7 @@ public class PublicationsRetrieverPlugin {
int numOfUnretrievedFiles = 0;
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same timestamp for all current records.
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
for ( DataForOutput data : FileUtils.dataForOutput )
{
// TODO - Consider adding multi-thread execution for the following code.
// In that case, use "ConcurrentHashMap".
@@ -218,7 +220,7 @@ public class PublicationsRetrieverPlugin {
// Search that ID and sourceUrl inside the list, if that instance is the first-found one, then get the file-data (there might be duplicate ID-sourceUrl instances, but only one of them has the file-data).
boolean foundAlreadyDownloadedFullText = false;
boolean foundIDUrlInWorkerReport = false;
for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList )
for ( DataForOutput data_2 : FileUtils.dataForOutput )
{
if ( ! (data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl))) )
continue;
@@ -237,7 +239,7 @@ public class PublicationsRetrieverPlugin {
if ( tempFileLocation.startsWith(ConnSupportUtils.alreadyDownloadedFromIDMessage, 0) || tempFileLocation.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) )
continue;
// At this point we have found that another instance of the same record gives the docFile itself, not a reference to it.
// At this point we have found that another instance of the same record gives the docFile itself, not a reference to it, nor a problematic case.
fileLocation = tempFileLocation;
size = data_2.getSize();
hash = data_2.getHash();
@@ -255,7 +257,7 @@ public class PublicationsRetrieverPlugin {
else if ( ! comment.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) ) { // If it was downloaded without an error.
fileLocation = comment; // This is the full-file-path.
mimeType = "application/pdf";
} else { // Else the file was not retrieved, so all file-related data are kept "null".
} else { // Else the file was not retrieved, so the file-related data is kept "null".
numOfUnretrievedFiles ++;
error = new Error(Error.ErrorType.couldRetry, comment); // We can still try to download it from the found docUrl, in the future.
}
@@ -290,7 +292,7 @@ public class PublicationsRetrieverPlugin {
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
}// end-for
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
FileUtils.dataForOutput.clear(); // Empty the list, to be re-populated by the next batch / assignment.
if ( numOfUnretrievedFiles > 50 )
logger.warn("The number of non-retrieved files is: " + numOfUnretrievedFiles);