- Process the Error of PDF-aggregation. Distinguish between "couldRetry" and "noRetry" cases.

- Add a "test"-switch in order to easily switch between test and normal mode.
- Fix an NPE when requesting the "AssignmentRequest".
- Upgrade the "installPublicationsRetriever.sh" to "installAndRun.sh", which takes care of everything.
- Define the newest SpringBoot-version in "build.gradle".
- Code cleanup.
This commit is contained in:
Lampros Smyrnaios 2021-08-05 15:09:28 +03:00
parent 6cc2673fca
commit 62ce7ee4a5
6 changed files with 101 additions and 31 deletions

View File

@ -1,5 +1,6 @@
// Build-script level extra properties: the version numbers referenced by the dependency declarations of this build file.
buildscript {
ext {
// Version used for the "org.springframework.boot" dependencies.
springBootVersion = "2.5.3"
// Version used for the "org.springframework.security" dependencies.
springSecurityVersion = "5.5.1"
}
}
@ -24,11 +25,11 @@ repositories {
dependencies {
runtimeOnly 'org.springframework.boot:spring-boot-devtools'
runtimeOnly "org.springframework.boot:spring-boot-devtools:${springBootVersion}"
implementation 'org.springframework.boot:spring-boot-starter-web'
implementation("org.springframework.boot:spring-boot-starter-security")
implementation("org.springframework.boot:spring-boot-configuration-processor")
implementation("org.springframework.boot:spring-boot-starter-web:${springBootVersion}")
implementation("org.springframework.boot:spring-boot-starter-security:${springBootVersion}")
implementation("org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}")
implementation("org.springframework.security:spring-security-core:${springSecurityVersion}")
implementation("org.springframework.security:spring-security-web:${springSecurityVersion}")
implementation("org.springframework.security:spring-security-config:${springSecurityVersion}")
@ -44,7 +45,7 @@ dependencies {
}
testImplementation group: 'org.springframework.security', name: 'spring-security-test', version: springSecurityVersion
testImplementation 'org.springframework.boot:spring-boot-starter-test'
testImplementation "org.springframework.boot:spring-boot-starter-test:${springBootVersion}"
}
test {

View File

@ -15,3 +15,23 @@ cp target/publications_retriever-1.0-SNAPSHOT.jar ../publications_retriever-1.0-
# Delete the directory with the source-code.
cd ../ && rm -rf PublicationsRetriever
# Clean and (re)build and run the project.
cd ../
echo -e "\nAsking for sudo, in order to verify the installation of 'gradle'..\n"
gradleVersion="7.1.1"
gradleHome="/opt/gradle/gradle-${gradleVersion}"
# Install the Gradle distribution only when it is missing, so that re-running this script
# neither downloads a duplicate archive nor stalls on unzip's interactive overwrite prompt.
if [ ! -d "${gradleHome}" ]; then
  # "-nc" re-uses an archive left behind by a previous (possibly interrupted) run.
  wget -nc "https://services.gradle.org/distributions/gradle-${gradleVersion}-bin.zip"
  # "-p" avoids an error when "/opt/gradle" already exists (e.g. another Gradle version is installed there).
  sudo mkdir -p /opt/gradle
  # "-o" overwrites any partially-extracted files without prompting.
  sudo unzip -o -d /opt/gradle "gradle-${gradleVersion}-bin.zip"
fi
ls "${gradleHome}"
# Make the "gradle" command available for the rest of this script.
export PATH="$PATH:${gradleHome}/bin"
gradle wrapper --gradle-version="${gradleVersion}" --distribution-type=bin
gradle tasks
gradle -v
gradle clean
gradle build
gradle bootRun

View File

@ -11,18 +11,27 @@ import com.fasterxml.jackson.annotation.JsonPropertyOrder;
})
public class Error {
public enum ErrorType {
couldRetry, noRetry
}
@JsonProperty("type")
private String type;
private ErrorType type;
@JsonProperty("message")
private String message;
public String getType() {
public Error(ErrorType type, String message) {
this.type = type;
this.message = message;
}
public ErrorType getType() {
return type;
}
public void setType(String type) {
type = type;
public void setType(ErrorType type) {
this.type = type;
}
public String getMessage() {
@ -36,7 +45,7 @@ public class Error {
@Override
public String toString() {
return "Error{" +
"type='" + type + '\'' +
"type=" + type +
", message='" + message + '\'' +
'}';
}

View File

@ -8,7 +8,8 @@ import com.fasterxml.jackson.annotation.JsonPropertyOrder;
@JsonInclude(JsonInclude.Include.NON_NULL)
@JsonPropertyOrder({
"status",
"payload"
"payload",
"error"
})
public class UrlReport {
@ -18,10 +19,14 @@ public class UrlReport {
@JsonProperty("payload")
private Payload payload;
@JsonProperty("error")
private Error error;
public UrlReport(String status, Payload payload) {
public UrlReport(String status, Payload payload, Error error) {
this.status = status;
this.payload = payload;
this.error = error;
}
@ -41,11 +46,20 @@ public class UrlReport {
this.payload = payload;
}
public Error getError() {
return error;
}
public void setError(Error error) {
this.error = error;
}
@Override
public String toString() {
return "UrlReport{" +
"status='" + status + '\'' +
", payload=" + payload +
", error=" + error +
'}';
}
}

View File

@ -4,6 +4,7 @@ import com.google.common.hash.Hashing;
import com.google.common.io.Files;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.file.S3ObjectStoreMinIO;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
@ -12,6 +13,7 @@ import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import eu.openaire.urls_worker.models.Assignment;
import eu.openaire.urls_worker.models.Error;
import eu.openaire.urls_worker.models.Payload;
import eu.openaire.urls_worker.models.UrlReport;
import eu.openaire.urls_worker.util.AssignmentHandler;
@ -91,7 +93,7 @@ public class PublicationsRetrieverPlugin {
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
logger.warn("Could not canonicalize url: " + sourceUrl);
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false");
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;
}
@ -114,12 +116,17 @@ public class PublicationsRetrieverPlugin {
HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
} catch (Exception e) {
String wasUrlValid = "true";
String couldRetry = "false";
if ( e instanceof RuntimeException ) {
String message = e.getMessage();
if ( (message != null) && message.contains("HTTP 404 Client Error") )
wasUrlValid = "false";
if ( message != null) {
if ( message.contains("HTTP 404 Client Error") )
wasUrlValid = "false";
else if ( message.contains("Server Error") || message.contains("HTTP 408") )
couldRetry = "true"; // We could retry at a later time, as the HTTP-non-404-errors can be temporal.
}
}
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false");
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
}
return true;
});
@ -135,19 +142,23 @@ public class PublicationsRetrieverPlugin {
}
private static final String DocFileNotRetrievedExceptionName = DocFileNotRetrievedException.class.getName(); // Keep it here for easily spot if the exception changes inside the PublicationsRetriever library.
public static void addUrlReportsToWorkerReport()
{
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
{
String status = null, fileLocation = null, hash = null;
Long size = null;
Error error = null;
String comment = data.getComment();
if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
{
status = "accessible";
fileLocation = data.getComment();
if ( fileLocation.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
if ( comment.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
// The file of this docUrl was already downloaded by another docUrl.
String previousId = fileLocation.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1);
String previousId = comment.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1);
//logger.debug("previousId: " + previousId); // DEBUG!
// Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location.
for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {
@ -157,10 +168,10 @@ public class PublicationsRetrieverPlugin {
}
}
}
else if ( fileLocation.contains("DocFileNotRetrievedException") )
fileLocation = null;
if ( fileLocation != null ) {
else if ( comment.contains(DocFileNotRetrievedExceptionName) )
fileLocation = "File not retrieved";
else {
fileLocation = comment;
try {
File docFile = new File(fileLocation);
if ( docFile.isFile() ) {
@ -179,13 +190,23 @@ public class PublicationsRetrieverPlugin {
e.printStackTrace();
}
} else
fileLocation = "File not retrieved";
} else
}
} else {
status = "non-accessible";
if ( data.getCouldRetry().equals("true") )
error = new Error(Error.ErrorType.couldRetry, comment);
else
error = new Error(Error.ErrorType.noRetry, comment);
}
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), data.getDocOrDatasetUrl(), new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
AssignmentHandler.urlReports.add(new UrlReport(status, payload));
String docOrDatasetUrl = data.getDocOrDatasetUrl();
if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
docOrDatasetUrl = null;
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
AssignmentHandler.urlReports.add(new UrlReport(status, payload, error));
}
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
}
@ -195,7 +216,7 @@ public class PublicationsRetrieverPlugin {
try {
return HttpConnUtils.connectAndCheckMimeType("null", urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
} catch (Exception e) {
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false");
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false");
return false;
}
}

View File

@ -27,11 +27,13 @@ public class AssignmentHandler {
public static List<UrlReport> urlReports = null;
private static final boolean askForTest = true; // Enable this only for testing.
public static AssignmentRequest requestAssignments()
{
RestTemplate restTemplate = new RestTemplateBuilder().build();
String url = "http://localhost:1880/api/urls/test?workerId=" + WorkerConstants.WORKER_ID + "&workerAssignmentsLimit=" + WorkerConstants.ASSIGNMENTS_LIMIT;
String url = "http://localhost:1880/api/urls" + (askForTest ? "/test" : "") + "?workerId=" + WorkerConstants.WORKER_ID + "&workerAssignmentsLimit=" + WorkerConstants.ASSIGNMENTS_LIMIT;
String json = null;
try {
json = restTemplate.getForObject(url, String.class);
@ -58,7 +60,6 @@ public class AssignmentHandler {
//logger.debug(assignmentRequest.toString()); // DEBUG!
logger.info("AssignmentRequest < " + assignmentRequest.getAssignmentCounter() + " > was received and it's ready to be processed. It contains " + assignmentRequest.getAssignments().size() + " tasks.");
// TODO - Maybe create a HashSet with these IDs. It may be useful for the Worker to know and report which assignments (and how many) it has processed.
return assignmentRequest;
}
@ -67,6 +68,10 @@ public class AssignmentHandler {
public static void handleAssignments()
{
AssignmentRequest assignmentRequest = requestAssignments();
if ( assignmentRequest == null ) {
logger.error("The \"assignmentRequest\" was \"null\"!");
return;
}
Long assignmentRequestCounter = assignmentRequest.getAssignmentCounter();