diff --git a/build.gradle b/build.gradle index cc9bf21..79739fc 100644 --- a/build.gradle +++ b/build.gradle @@ -1,5 +1,6 @@ buildscript { ext { + springBootVersion = "2.5.3" springSecurityVersion = "5.5.1" } } @@ -24,11 +25,11 @@ repositories { dependencies { - runtimeOnly 'org.springframework.boot:spring-boot-devtools' + runtimeOnly "org.springframework.boot:spring-boot-devtools:${springBootVersion}" - implementation 'org.springframework.boot:spring-boot-starter-web' - implementation("org.springframework.boot:spring-boot-starter-security") - implementation("org.springframework.boot:spring-boot-configuration-processor") + implementation("org.springframework.boot:spring-boot-starter-web:${springBootVersion}") + implementation("org.springframework.boot:spring-boot-starter-security:${springBootVersion}") + implementation("org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}") implementation("org.springframework.security:spring-security-core:${springSecurityVersion}") implementation("org.springframework.security:spring-security-web:${springSecurityVersion}") implementation("org.springframework.security:spring-security-config:${springSecurityVersion}") @@ -44,7 +45,7 @@ dependencies { } testImplementation group: 'org.springframework.security', name: 'spring-security-test', version: springSecurityVersion - testImplementation 'org.springframework.boot:spring-boot-starter-test' + testImplementation "org.springframework.boot:spring-boot-starter-test:${springBootVersion}" } test { diff --git a/installPublicationsRetriever.sh b/installAndRun.sh similarity index 60% rename from installPublicationsRetriever.sh rename to installAndRun.sh index 457ec13..0e51202 100755 --- a/installPublicationsRetriever.sh +++ b/installAndRun.sh @@ -15,3 +15,23 @@ cp target/publications_retriever-1.0-SNAPSHOT.jar ../publications_retriever-1.0- # Delete the directory with the source-code. cd ../ && rm -rf PublicationsRetriever + +# Clean and (re)build and run the project. +cd ../ +echo -e "\nAsking for sudo, in order to verify the installation of 'gradle'..\n" + +gradleVersion="7.1.1" + +wget https://services.gradle.org/distributions/gradle-${gradleVersion}-bin.zip + +sudo mkdir /opt/gradle +sudo unzip -d /opt/gradle gradle-${gradleVersion}-bin.zip +ls /opt/gradle/gradle-${gradleVersion} + +export PATH=$PATH:/opt/gradle/gradle-${gradleVersion}/bin +gradle wrapper --gradle-version=${gradleVersion} --distribution-type=bin +gradle tasks +gradle -v +gradle clean +gradle build +gradle bootRun diff --git a/src/main/java/eu/openaire/urls_worker/models/Error.java b/src/main/java/eu/openaire/urls_worker/models/Error.java index f01830a..66b35eb 100644 --- a/src/main/java/eu/openaire/urls_worker/models/Error.java +++ b/src/main/java/eu/openaire/urls_worker/models/Error.java @@ -11,18 +11,27 @@ import com.fasterxml.jackson.annotation.JsonPropertyOrder; }) public class Error { + public enum ErrorType { + couldRetry, noRetry + } + @JsonProperty("type") - private String type; + private ErrorType type; @JsonProperty("message") private String message; - public String getType() { + public Error(ErrorType type, String message) { + this.type = type; + this.message = message; + } + + public ErrorType getType() { return type; } - public void setType(String type) { - type = type; + public void setType(ErrorType type) { + this.type = type; } public String getMessage() { @@ -36,7 +45,7 @@ public class Error { @Override public String toString() { return "Error{" + - "type='" + type + '\'' + + "type=" + type + ", message='" + message + '\'' + '}'; } diff --git a/src/main/java/eu/openaire/urls_worker/models/UrlReport.java b/src/main/java/eu/openaire/urls_worker/models/UrlReport.java index e40460c..abe7a1e 100644 --- a/src/main/java/eu/openaire/urls_worker/models/UrlReport.java +++ b/src/main/java/eu/openaire/urls_worker/models/UrlReport.java @@ -8,7 +8,8 @@ import com.fasterxml.jackson.annotation.JsonPropertyOrder; @JsonInclude(JsonInclude.Include.NON_NULL) @JsonPropertyOrder({ "status", - "payload" + "payload", + "error" }) public class UrlReport { @@ -18,10 +19,14 @@ public class UrlReport { @JsonProperty("payload") private Payload payload; + @JsonProperty("error") + private Error error; - public UrlReport(String status, Payload payload) { + + public UrlReport(String status, Payload payload, Error error) { this.status = status; this.payload = payload; + this.error = error; } @@ -41,11 +46,20 @@ public class UrlReport { this.payload = payload; } + public Error getError() { + return error; + } + + public void setError(Error error) { + this.error = error; + } + @Override public String toString() { return "UrlReport{" + "status='" + status + '\'' + ", payload=" + payload + + ", error=" + error + '}'; } } diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java index 82c71d2..7d1f473 100644 --- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java @@ -4,6 +4,7 @@ import com.google.common.hash.Hashing; import com.google.common.io.Files; import edu.uci.ics.crawler4j.url.URLCanonicalizer; import eu.openaire.publications_retriever.PublicationsRetriever; +import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException; import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.file.S3ObjectStoreMinIO; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; @@ -12,6 +13,7 @@ import eu.openaire.publications_retriever.util.url.DataToBeLogged; import eu.openaire.publications_retriever.util.url.LoaderAndChecker; import eu.openaire.publications_retriever.util.url.UrlUtils; import eu.openaire.urls_worker.models.Assignment; +import eu.openaire.urls_worker.models.Error; import eu.openaire.urls_worker.models.Payload; import eu.openaire.urls_worker.models.UrlReport; import eu.openaire.urls_worker.util.AssignmentHandler; @@ -91,7 +93,7 @@ public class PublicationsRetrieverPlugin { String sourceUrl = urlToCheck; // Hold it here for the logging-messages. if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) { logger.warn("Could not canonicalize url: " + sourceUrl); - UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false"); + UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false"); LoaderAndChecker.connProblematicUrls.incrementAndGet(); return false; } @@ -114,12 +116,17 @@ public class PublicationsRetrieverPlugin { HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl); } catch (Exception e) { String wasUrlValid = "true"; + String couldRetry = "false"; if ( e instanceof RuntimeException ) { String message = e.getMessage(); - if ( (message != null) && message.contains("HTTP 404 Client Error") ) - wasUrlValid = "false"; + if ( message != null) { + if ( message.contains("HTTP 404 Client Error") ) + wasUrlValid = "false"; + else if ( message.contains("Server Error") || message.contains("HTTP 408") ) + couldRetry = "true"; // We could retry at a later time, as the HTTP-non-404-errors can be temporal. + } } - UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false"); + UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry); } return true; }); @@ -135,19 +142,23 @@ public class PublicationsRetrieverPlugin { } + private static final String DocFileNotRetrievedExceptionName = DocFileNotRetrievedException.class.getName(); // Keep it here for easily spot if the exception changes inside the PublicationsRetriever library. + public static void addUrlReportsToWorkerReport() { for ( DataToBeLogged data : FileUtils.dataToBeLoggedList ) { String status = null, fileLocation = null, hash = null; Long size = null; + Error error = null; + String comment = data.getComment(); + if ( data.getWasDocumentOrDatasetAccessible().equals("true") ) { status = "accessible"; - fileLocation = data.getComment(); - if ( fileLocation.contains(UrlUtils.alreadyDownloadedByIDMessage) ) { + if ( comment.contains(UrlUtils.alreadyDownloadedByIDMessage) ) { // The file of this docUrl was already downloaded by another docUrl. - String previousId = fileLocation.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1); + String previousId = comment.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1); //logger.debug("previousId: " + previousId); // DEBUG! // Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location. for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) { @@ -157,10 +168,10 @@ public class PublicationsRetrieverPlugin { } } } - else if ( fileLocation.contains("DocFileNotRetrievedException") ) - fileLocation = null; - - if ( fileLocation != null ) { + else if ( comment.contains(DocFileNotRetrievedExceptionName) ) + fileLocation = "File not retrieved"; + else { + fileLocation = comment; try { File docFile = new File(fileLocation); if ( docFile.isFile() ) { @@ -179,13 +190,23 @@ public class PublicationsRetrieverPlugin { e.printStackTrace(); } - } else - fileLocation = "File not retrieved"; - } else + } + } else { status = "non-accessible"; + if ( data.getCouldRetry().equals("true") ) + error = new Error(Error.ErrorType.couldRetry, comment); + else + error = new Error(Error.ErrorType.noRetry, comment); + } - Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), data.getDocOrDatasetUrl(), new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever"); - AssignmentHandler.urlReports.add(new UrlReport(status, payload)); + String docOrDatasetUrl = data.getDocOrDatasetUrl(); + if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) ) + docOrDatasetUrl = null; + + Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever"); + // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified. + + AssignmentHandler.urlReports.add(new UrlReport(status, payload, error)); } FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment. } @@ -195,7 +216,7 @@ public class PublicationsRetrieverPlugin { try { return HttpConnUtils.connectAndCheckMimeType("null", urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures. } catch (Exception e) { - UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false"); + UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false"); return false; } } diff --git a/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java b/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java index ec3f726..c2dab55 100644 --- a/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java +++ b/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java @@ -27,11 +27,13 @@ public class AssignmentHandler { public static List urlReports = null; + private static final boolean askForTest = true; // Enable this only for testing. + public static AssignmentRequest requestAssignments() { RestTemplate restTemplate = new RestTemplateBuilder().build(); - String url = "http://localhost:1880/api/urls/test?workerId=" + WorkerConstants.WORKER_ID + "&workerAssignmentsLimit=" + WorkerConstants.ASSIGNMENTS_LIMIT; + String url = "http://localhost:1880/api/urls" + (askForTest ? "/test" : "") + "?workerId=" + WorkerConstants.WORKER_ID + "&workerAssignmentsLimit=" + WorkerConstants.ASSIGNMENTS_LIMIT; String json = null; try { json = restTemplate.getForObject(url, String.class); @@ -58,7 +60,6 @@ public class AssignmentHandler { //logger.debug(assignmentRequest.toString()); // DEBUG! logger.info("AssignmentRequest < " + assignmentRequest.getAssignmentCounter() + " > was received and it's ready to be processed. It contains " + assignmentRequest.getAssignments().size() + " tasks."); - // TODO - Maybe create a HashSet with these IDs. It may be useful for the Worker to know and report which assignments (and how many) it has processed. return assignmentRequest; } @@ -67,6 +68,10 @@ public class AssignmentHandler { public static void handleAssignments() { AssignmentRequest assignmentRequest = requestAssignments(); + if ( assignmentRequest == null ) { + logger.error("The \"assignmentRequest\" was \"null\"!"); + return; + } Long assignmentRequestCounter = assignmentRequest.getAssignmentCounter();