- Calculate and set the max heap size with respect to the system resources, in "installAndRun.sh".

- Fix the "Error"-members not being set correctly when the docUrl was found, but the full-text was not retrieved.
- Set a "couldRetry"-indication in the "Error"-class when the full-text was retrieved, since, in general, retrying it should give the same successful result.
- Update the "docFileNotRetrieved"-check to use the standardized string.
- Eliminate some possible NPEs.
- Update Gradle.
Lampros Smyrnaios 2 years ago
parent 0db35a83e7
commit 82d69f3bf5

@ -8,7 +8,6 @@ group = 'eu.openaire.urls_worker'
version = '1.0.0-SNAPSHOT'
sourceCompatibility = '1.8'
repositories {
flatDir {
@ -16,7 +15,6 @@ repositories {
dependencies {
runtimeOnly "org.springframework.boot:spring-boot-devtools"

@ -1,5 +1,5 @@

@ -28,7 +28,7 @@ if [[ ! -f $inputDataFile ]]; then
echo -e "\n\n"
if [[ justInstall -eq 0 ]]; then
@ -63,6 +63,13 @@ if [[ justInstall -eq 0 ]]; then
export PATH=$PATH:/opt/gradle/gradle-${gradleVersion}/bin
# Update the max-heap-size based on the machine's physical memory.
machine_memory_mb=$(grep MemTotal /proc/meminfo | awk '{print $2}' | xargs -I {} echo "scale=4; {}/1024" | bc) # It returns the size in MB.
max_heap_size_mb=$(echo "($machine_memory_mb - 768)/1" | bc) # Leave 768 MB to the system (the "()/1" is used to take the floor value).
# Now, we replace the "-Xmx" parameter inside the "./build.gradle" file, with "-Xmx${max_heap_size_mb}m"
echo -e "\n\nThe max-heap-size (-Xmx) will be set to: ${max_heap_size_mb}m\n\n"
sed -i "s/'-Xmx[0-9]\+[gm]'/'-Xmx${max_heap_size_mb}m'/g" ./build.gradle
gradle wrapper --gradle-version=${gradleVersion} --distribution-type=bin
#gradle tasks # For debugging installation

@ -2,7 +2,6 @@ package eu.openaire.urls_worker.plugins;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
@ -136,8 +135,6 @@ public class PublicationsRetrieverPlugin {
private static final String DocFileNotRetrievedExceptionName = DocFileNotRetrievedException.class.getSimpleName(); // Keep it here for easily spot if the exception changes inside the PublicationsRetriever library.
public static void addUrlReportsToWorkerReport()
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
@ -149,7 +146,7 @@ public class PublicationsRetrieverPlugin {
Long size = data.getSize();
Error error = null;
if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) ) // The reversed order defends against a potential NPE.
status = UrlReport.StatusType.accessible;
if ( comment.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
@ -168,15 +165,18 @@ public class PublicationsRetrieverPlugin {
// TODO - The case where the "twin-ID" is not found, should "never" happen. But should we check? How to handle if that is the case..?
else if ( ! comment.contains(DocFileNotRetrievedExceptionName) ) { // If it was downloaded without an error.
else if ( ! comment.equals(HttpConnUtils.docFileNotRetrievedMessage) ) { // If it was downloaded without an error.
fileLocation = comment; // This is the full-file-path.
mimeType = "application/pdf";
error = new Error(null, null); // We do not want to send a "null" object, since it just adds more complicated handling in the controller..
} else // Else the file was not retrieved, so all file-related data are kept "null".
error = new Error(Error.ErrorType.couldRetry, comment); // We can still try to download it in the future.
if ( error == null ) // If the file was retrieved, in any time.
error = new Error(Error.ErrorType.couldRetry, null); // We do not want to send a "null" object, since it just adds more complicated handling in the controller..
else {
status = UrlReport.StatusType.non_accessible;
if ( data.getCouldRetry().equals("true") )
if ( "true".equals(data.getCouldRetry()) )
error = new Error(Error.ErrorType.couldRetry, comment);
error = new Error(Error.ErrorType.noRetry, comment);
