- Process the errors of the PDF-aggregation, distinguishing between the "couldRetry" and "noRetry" cases.

- Add a "test"-switch in order to easily switch between test and normal mode.
- Fix an NPE when requesting the "AssignmentRequest".
- Upgrade the "installPublicationsRetriever.sh" to "installAndRun.sh", which now also installs Gradle and then builds and runs the project.
- Define the newest SpringBoot-version in "build.gradle".
- Code cleanup.
This commit is contained in:
Lampros Smyrnaios 2021-08-05 15:09:28 +03:00
parent 6cc2673fca
commit 62ce7ee4a5
6 changed files with 101 additions and 31 deletions

View File

@ -1,5 +1,6 @@
buildscript { buildscript {
ext { ext {
springBootVersion = "2.5.3"
springSecurityVersion = "5.5.1" springSecurityVersion = "5.5.1"
} }
} }
@ -24,11 +25,11 @@ repositories {
dependencies { dependencies {
runtimeOnly 'org.springframework.boot:spring-boot-devtools' runtimeOnly "org.springframework.boot:spring-boot-devtools:${springBootVersion}"
implementation 'org.springframework.boot:spring-boot-starter-web' implementation("org.springframework.boot:spring-boot-starter-web:${springBootVersion}")
implementation("org.springframework.boot:spring-boot-starter-security") implementation("org.springframework.boot:spring-boot-starter-security:${springBootVersion}")
implementation("org.springframework.boot:spring-boot-configuration-processor") implementation("org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}")
implementation("org.springframework.security:spring-security-core:${springSecurityVersion}") implementation("org.springframework.security:spring-security-core:${springSecurityVersion}")
implementation("org.springframework.security:spring-security-web:${springSecurityVersion}") implementation("org.springframework.security:spring-security-web:${springSecurityVersion}")
implementation("org.springframework.security:spring-security-config:${springSecurityVersion}") implementation("org.springframework.security:spring-security-config:${springSecurityVersion}")
@ -44,7 +45,7 @@ dependencies {
} }
testImplementation group: 'org.springframework.security', name: 'spring-security-test', version: springSecurityVersion testImplementation group: 'org.springframework.security', name: 'spring-security-test', version: springSecurityVersion
testImplementation 'org.springframework.boot:spring-boot-starter-test' testImplementation "org.springframework.boot:spring-boot-starter-test:${springBootVersion}"
} }
test { test {

View File

@ -15,3 +15,23 @@ cp target/publications_retriever-1.0-SNAPSHOT.jar ../publications_retriever-1.0-
# Delete the directory with the source-code. # Delete the directory with the source-code.
cd ../ && rm -rf PublicationsRetriever cd ../ && rm -rf PublicationsRetriever
# Clean and (re)build and run the project.
cd ../
echo -e "\nAsking for sudo, in order to verify the installation of 'gradle'..\n"
gradleVersion="7.1.1"
wget https://services.gradle.org/distributions/gradle-${gradleVersion}-bin.zip
sudo mkdir /opt/gradle
sudo unzip -d /opt/gradle gradle-${gradleVersion}-bin.zip
ls /opt/gradle/gradle-${gradleVersion}
export PATH=$PATH:/opt/gradle/gradle-${gradleVersion}/bin
gradle wrapper --gradle-version=${gradleVersion} --distribution-type=bin
gradle tasks
gradle -v
gradle clean
gradle build
gradle bootRun

View File

@ -11,18 +11,27 @@ import com.fasterxml.jackson.annotation.JsonPropertyOrder;
}) })
public class Error { public class Error {
public enum ErrorType {
couldRetry, noRetry
}
@JsonProperty("type") @JsonProperty("type")
private String type; private ErrorType type;
@JsonProperty("message") @JsonProperty("message")
private String message; private String message;
public String getType() { public Error(ErrorType type, String message) {
this.type = type;
this.message = message;
}
public ErrorType getType() {
return type; return type;
} }
public void setType(String type) { public void setType(ErrorType type) {
type = type; this.type = type;
} }
public String getMessage() { public String getMessage() {
@ -36,7 +45,7 @@ public class Error {
@Override @Override
public String toString() { public String toString() {
return "Error{" + return "Error{" +
"type='" + type + '\'' + "type=" + type +
", message='" + message + '\'' + ", message='" + message + '\'' +
'}'; '}';
} }

View File

@ -8,7 +8,8 @@ import com.fasterxml.jackson.annotation.JsonPropertyOrder;
@JsonInclude(JsonInclude.Include.NON_NULL) @JsonInclude(JsonInclude.Include.NON_NULL)
@JsonPropertyOrder({ @JsonPropertyOrder({
"status", "status",
"payload" "payload",
"error"
}) })
public class UrlReport { public class UrlReport {
@ -18,10 +19,14 @@ public class UrlReport {
@JsonProperty("payload") @JsonProperty("payload")
private Payload payload; private Payload payload;
@JsonProperty("error")
private Error error;
public UrlReport(String status, Payload payload) {
public UrlReport(String status, Payload payload, Error error) {
this.status = status; this.status = status;
this.payload = payload; this.payload = payload;
this.error = error;
} }
@ -41,11 +46,20 @@ public class UrlReport {
this.payload = payload; this.payload = payload;
} }
public Error getError() {
return error;
}
public void setError(Error error) {
this.error = error;
}
@Override @Override
public String toString() { public String toString() {
return "UrlReport{" + return "UrlReport{" +
"status='" + status + '\'' + "status='" + status + '\'' +
", payload=" + payload + ", payload=" + payload +
", error=" + error +
'}'; '}';
} }
} }

View File

@ -4,6 +4,7 @@ import com.google.common.hash.Hashing;
import com.google.common.io.Files; import com.google.common.io.Files;
import edu.uci.ics.crawler4j.url.URLCanonicalizer; import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.file.S3ObjectStoreMinIO; import eu.openaire.publications_retriever.util.file.S3ObjectStoreMinIO;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
@ -12,6 +13,7 @@ import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker; import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils; import eu.openaire.publications_retriever.util.url.UrlUtils;
import eu.openaire.urls_worker.models.Assignment; import eu.openaire.urls_worker.models.Assignment;
import eu.openaire.urls_worker.models.Error;
import eu.openaire.urls_worker.models.Payload; import eu.openaire.urls_worker.models.Payload;
import eu.openaire.urls_worker.models.UrlReport; import eu.openaire.urls_worker.models.UrlReport;
import eu.openaire.urls_worker.util.AssignmentHandler; import eu.openaire.urls_worker.util.AssignmentHandler;
@ -91,7 +93,7 @@ public class PublicationsRetrieverPlugin {
String sourceUrl = urlToCheck; // Hold it here for the logging-messages. String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) { if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
logger.warn("Could not canonicalize url: " + sourceUrl); logger.warn("Could not canonicalize url: " + sourceUrl);
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false"); UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false");
LoaderAndChecker.connProblematicUrls.incrementAndGet(); LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false; return false;
} }
@ -114,12 +116,17 @@ public class PublicationsRetrieverPlugin {
HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl); HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
} catch (Exception e) { } catch (Exception e) {
String wasUrlValid = "true"; String wasUrlValid = "true";
String couldRetry = "false";
if ( e instanceof RuntimeException ) { if ( e instanceof RuntimeException ) {
String message = e.getMessage(); String message = e.getMessage();
if ( (message != null) && message.contains("HTTP 404 Client Error") ) if ( message != null) {
wasUrlValid = "false"; if ( message.contains("HTTP 404 Client Error") )
wasUrlValid = "false";
else if ( message.contains("Server Error") || message.contains("HTTP 408") )
couldRetry = "true"; // We could retry at a later time, as the HTTP-non-404-errors can be temporal.
}
} }
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false"); UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
} }
return true; return true;
}); });
@ -135,19 +142,23 @@ public class PublicationsRetrieverPlugin {
} }
private static final String DocFileNotRetrievedExceptionName = DocFileNotRetrievedException.class.getName(); // Keep it here for easily spot if the exception changes inside the PublicationsRetriever library.
public static void addUrlReportsToWorkerReport() public static void addUrlReportsToWorkerReport()
{ {
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList ) for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
{ {
String status = null, fileLocation = null, hash = null; String status = null, fileLocation = null, hash = null;
Long size = null; Long size = null;
Error error = null;
String comment = data.getComment();
if ( data.getWasDocumentOrDatasetAccessible().equals("true") ) if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
{ {
status = "accessible"; status = "accessible";
fileLocation = data.getComment(); if ( comment.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
if ( fileLocation.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
// The file of this docUrl was already downloaded by another docUrl. // The file of this docUrl was already downloaded by another docUrl.
String previousId = fileLocation.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1); String previousId = comment.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1);
//logger.debug("previousId: " + previousId); // DEBUG! //logger.debug("previousId: " + previousId); // DEBUG!
// Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location. // Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location.
for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) { for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {
@ -157,10 +168,10 @@ public class PublicationsRetrieverPlugin {
} }
} }
} }
else if ( fileLocation.contains("DocFileNotRetrievedException") ) else if ( comment.contains(DocFileNotRetrievedExceptionName) )
fileLocation = null; fileLocation = "File not retrieved";
else {
if ( fileLocation != null ) { fileLocation = comment;
try { try {
File docFile = new File(fileLocation); File docFile = new File(fileLocation);
if ( docFile.isFile() ) { if ( docFile.isFile() ) {
@ -179,13 +190,23 @@ public class PublicationsRetrieverPlugin {
e.printStackTrace(); e.printStackTrace();
} }
} else }
fileLocation = "File not retrieved"; } else {
} else
status = "non-accessible"; status = "non-accessible";
if ( data.getCouldRetry().equals("true") )
error = new Error(Error.ErrorType.couldRetry, comment);
else
error = new Error(Error.ErrorType.noRetry, comment);
}
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), data.getDocOrDatasetUrl(), new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever"); String docOrDatasetUrl = data.getDocOrDatasetUrl();
AssignmentHandler.urlReports.add(new UrlReport(status, payload)); if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
docOrDatasetUrl = null;
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
AssignmentHandler.urlReports.add(new UrlReport(status, payload, error));
} }
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment. FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
} }
@ -195,7 +216,7 @@ public class PublicationsRetrieverPlugin {
try { try {
return HttpConnUtils.connectAndCheckMimeType("null", urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures. return HttpConnUtils.connectAndCheckMimeType("null", urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
} catch (Exception e) { } catch (Exception e) {
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false"); UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false");
return false; return false;
} }
} }

View File

@ -27,11 +27,13 @@ public class AssignmentHandler {
public static List<UrlReport> urlReports = null; public static List<UrlReport> urlReports = null;
private static final boolean askForTest = true; // Enable this only for testing.
public static AssignmentRequest requestAssignments() public static AssignmentRequest requestAssignments()
{ {
RestTemplate restTemplate = new RestTemplateBuilder().build(); RestTemplate restTemplate = new RestTemplateBuilder().build();
String url = "http://localhost:1880/api/urls/test?workerId=" + WorkerConstants.WORKER_ID + "&workerAssignmentsLimit=" + WorkerConstants.ASSIGNMENTS_LIMIT; String url = "http://localhost:1880/api/urls" + (askForTest ? "/test" : "") + "?workerId=" + WorkerConstants.WORKER_ID + "&workerAssignmentsLimit=" + WorkerConstants.ASSIGNMENTS_LIMIT;
String json = null; String json = null;
try { try {
json = restTemplate.getForObject(url, String.class); json = restTemplate.getForObject(url, String.class);
@ -58,7 +60,6 @@ public class AssignmentHandler {
//logger.debug(assignmentRequest.toString()); // DEBUG! //logger.debug(assignmentRequest.toString()); // DEBUG!
logger.info("AssignmentRequest < " + assignmentRequest.getAssignmentCounter() + " > was received and it's ready to be processed. It contains " + assignmentRequest.getAssignments().size() + " tasks."); logger.info("AssignmentRequest < " + assignmentRequest.getAssignmentCounter() + " > was received and it's ready to be processed. It contains " + assignmentRequest.getAssignments().size() + " tasks.");
// TODO - Maybe create a HashSet with these IDs. It may be useful for the Worker to know and report which assignments (and how many) it has processed.
return assignmentRequest; return assignmentRequest;
} }
@ -67,6 +68,10 @@ public class AssignmentHandler {
public static void handleAssignments() public static void handleAssignments()
{ {
AssignmentRequest assignmentRequest = requestAssignments(); AssignmentRequest assignmentRequest = requestAssignments();
if ( assignmentRequest == null ) {
logger.error("The \"assignmentRequest\" was \"null\"!");
return;
}
Long assignmentRequestCounter = assignmentRequest.getAssignmentCounter(); Long assignmentRequestCounter = assignmentRequest.getAssignmentCounter();