- Process the Error of PDF-aggregation. Distinguish between "couldRetry" and "noRetry" cases.
- Add a "test"-switch in order to easily switch between test and normal mode.
- Fix an NPE when requesting the "AssignmentRequest".
- Upgrade the "installPublicationsRetriever.sh" to "installAndRun.sh", which takes care of everything.
- Define the newest SpringBoot-version in "build.gradle".
- Code cleanup.
This commit is contained in:
parent
6cc2673fca
commit
62ce7ee4a5
11
build.gradle
11
build.gradle
|
@ -1,5 +1,6 @@
|
|||
buildscript {
|
||||
ext {
|
||||
springBootVersion = "2.5.3"
|
||||
springSecurityVersion = "5.5.1"
|
||||
}
|
||||
}
|
||||
|
@ -24,11 +25,11 @@ repositories {
|
|||
|
||||
|
||||
dependencies {
|
||||
runtimeOnly 'org.springframework.boot:spring-boot-devtools'
|
||||
runtimeOnly "org.springframework.boot:spring-boot-devtools:${springBootVersion}"
|
||||
|
||||
implementation 'org.springframework.boot:spring-boot-starter-web'
|
||||
implementation("org.springframework.boot:spring-boot-starter-security")
|
||||
implementation("org.springframework.boot:spring-boot-configuration-processor")
|
||||
implementation("org.springframework.boot:spring-boot-starter-web:${springBootVersion}")
|
||||
implementation("org.springframework.boot:spring-boot-starter-security:${springBootVersion}")
|
||||
implementation("org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}")
|
||||
implementation("org.springframework.security:spring-security-core:${springSecurityVersion}")
|
||||
implementation("org.springframework.security:spring-security-web:${springSecurityVersion}")
|
||||
implementation("org.springframework.security:spring-security-config:${springSecurityVersion}")
|
||||
|
@ -44,7 +45,7 @@ dependencies {
|
|||
}
|
||||
|
||||
testImplementation group: 'org.springframework.security', name: 'spring-security-test', version: springSecurityVersion
|
||||
testImplementation 'org.springframework.boot:spring-boot-starter-test'
|
||||
testImplementation "org.springframework.boot:spring-boot-starter-test:${springBootVersion}"
|
||||
}
|
||||
|
||||
test {
|
||||
|
|
|
@ -15,3 +15,23 @@ cp target/publications_retriever-1.0-SNAPSHOT.jar ../publications_retriever-1.0-
|
|||
|
||||
# Delete the directory with the source-code.
|
||||
cd ../ && rm -rf PublicationsRetriever
|
||||
|
||||
# Clean and (re)build and run the project.
|
||||
cd ../
|
||||
echo -e "\nAsking for sudo, in order to verify the installation of 'gradle'..\n"
|
||||
|
||||
gradleVersion="7.1.1"
|
||||
|
||||
wget https://services.gradle.org/distributions/gradle-${gradleVersion}-bin.zip
|
||||
|
||||
sudo mkdir /opt/gradle
|
||||
sudo unzip -d /opt/gradle gradle-${gradleVersion}-bin.zip
|
||||
ls /opt/gradle/gradle-${gradleVersion}
|
||||
|
||||
export PATH=$PATH:/opt/gradle/gradle-${gradleVersion}/bin
|
||||
gradle wrapper --gradle-version=${gradleVersion} --distribution-type=bin
|
||||
gradle tasks
|
||||
gradle -v
|
||||
gradle clean
|
||||
gradle build
|
||||
gradle bootRun
|
|
@ -11,18 +11,27 @@ import com.fasterxml.jackson.annotation.JsonPropertyOrder;
|
|||
})
|
||||
public class Error {
|
||||
|
||||
public enum ErrorType {
|
||||
couldRetry, noRetry
|
||||
}
|
||||
|
||||
@JsonProperty("type")
|
||||
private String type;
|
||||
private ErrorType type;
|
||||
|
||||
@JsonProperty("message")
|
||||
private String message;
|
||||
|
||||
public String getType() {
|
||||
public Error(ErrorType type, String message) {
|
||||
this.type = type;
|
||||
this.message = message;
|
||||
}
|
||||
|
||||
public ErrorType getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
type = type;
|
||||
public void setType(ErrorType type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String getMessage() {
|
||||
|
@ -36,7 +45,7 @@ public class Error {
|
|||
@Override
|
||||
public String toString() {
|
||||
return "Error{" +
|
||||
"type='" + type + '\'' +
|
||||
"type=" + type +
|
||||
", message='" + message + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
|
|
@ -8,7 +8,8 @@ import com.fasterxml.jackson.annotation.JsonPropertyOrder;
|
|||
@JsonInclude(JsonInclude.Include.NON_NULL)
|
||||
@JsonPropertyOrder({
|
||||
"status",
|
||||
"payload"
|
||||
"payload",
|
||||
"error"
|
||||
})
|
||||
public class UrlReport {
|
||||
|
||||
|
@ -18,10 +19,14 @@ public class UrlReport {
|
|||
@JsonProperty("payload")
|
||||
private Payload payload;
|
||||
|
||||
@JsonProperty("error")
|
||||
private Error error;
|
||||
|
||||
public UrlReport(String status, Payload payload) {
|
||||
|
||||
public UrlReport(String status, Payload payload, Error error) {
|
||||
this.status = status;
|
||||
this.payload = payload;
|
||||
this.error = error;
|
||||
}
|
||||
|
||||
|
||||
|
@ -41,11 +46,20 @@ public class UrlReport {
|
|||
this.payload = payload;
|
||||
}
|
||||
|
||||
public Error getError() {
|
||||
return error;
|
||||
}
|
||||
|
||||
public void setError(Error error) {
|
||||
this.error = error;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "UrlReport{" +
|
||||
"status='" + status + '\'' +
|
||||
", payload=" + payload +
|
||||
", error=" + error +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ import com.google.common.hash.Hashing;
|
|||
import com.google.common.io.Files;
|
||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
||||
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||||
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
|
||||
import eu.openaire.publications_retriever.util.file.FileUtils;
|
||||
import eu.openaire.publications_retriever.util.file.S3ObjectStoreMinIO;
|
||||
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
|
||||
|
@ -12,6 +13,7 @@ import eu.openaire.publications_retriever.util.url.DataToBeLogged;
|
|||
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
|
||||
import eu.openaire.publications_retriever.util.url.UrlUtils;
|
||||
import eu.openaire.urls_worker.models.Assignment;
|
||||
import eu.openaire.urls_worker.models.Error;
|
||||
import eu.openaire.urls_worker.models.Payload;
|
||||
import eu.openaire.urls_worker.models.UrlReport;
|
||||
import eu.openaire.urls_worker.util.AssignmentHandler;
|
||||
|
@ -91,7 +93,7 @@ public class PublicationsRetrieverPlugin {
|
|||
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
||||
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
|
||||
logger.warn("Could not canonicalize url: " + sourceUrl);
|
||||
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false");
|
||||
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false");
|
||||
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
||||
return false;
|
||||
}
|
||||
|
@ -114,12 +116,17 @@ public class PublicationsRetrieverPlugin {
|
|||
HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
|
||||
} catch (Exception e) {
|
||||
String wasUrlValid = "true";
|
||||
String couldRetry = "false";
|
||||
if ( e instanceof RuntimeException ) {
|
||||
String message = e.getMessage();
|
||||
if ( (message != null) && message.contains("HTTP 404 Client Error") )
|
||||
if ( message != null) {
|
||||
if ( message.contains("HTTP 404 Client Error") )
|
||||
wasUrlValid = "false";
|
||||
else if ( message.contains("Server Error") || message.contains("HTTP 408") )
|
||||
couldRetry = "true"; // We could retry at a later time, as the HTTP-non-404-errors can be temporal.
|
||||
}
|
||||
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false");
|
||||
}
|
||||
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
|
||||
}
|
||||
return true;
|
||||
});
|
||||
|
@ -135,19 +142,23 @@ public class PublicationsRetrieverPlugin {
|
|||
}
|
||||
|
||||
|
||||
private static final String DocFileNotRetrievedExceptionName = DocFileNotRetrievedException.class.getName(); // Keep it here for easily spot if the exception changes inside the PublicationsRetriever library.
|
||||
|
||||
public static void addUrlReportsToWorkerReport()
|
||||
{
|
||||
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
||||
{
|
||||
String status = null, fileLocation = null, hash = null;
|
||||
Long size = null;
|
||||
Error error = null;
|
||||
String comment = data.getComment();
|
||||
|
||||
if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
|
||||
{
|
||||
status = "accessible";
|
||||
fileLocation = data.getComment();
|
||||
if ( fileLocation.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
|
||||
if ( comment.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
|
||||
// The file of this docUrl was already downloaded by another docUrl.
|
||||
String previousId = fileLocation.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1);
|
||||
String previousId = comment.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1);
|
||||
//logger.debug("previousId: " + previousId); // DEBUG!
|
||||
// Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location.
|
||||
for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {
|
||||
|
@ -157,10 +168,10 @@ public class PublicationsRetrieverPlugin {
|
|||
}
|
||||
}
|
||||
}
|
||||
else if ( fileLocation.contains("DocFileNotRetrievedException") )
|
||||
fileLocation = null;
|
||||
|
||||
if ( fileLocation != null ) {
|
||||
else if ( comment.contains(DocFileNotRetrievedExceptionName) )
|
||||
fileLocation = "File not retrieved";
|
||||
else {
|
||||
fileLocation = comment;
|
||||
try {
|
||||
File docFile = new File(fileLocation);
|
||||
if ( docFile.isFile() ) {
|
||||
|
@ -179,13 +190,23 @@ public class PublicationsRetrieverPlugin {
|
|||
|
||||
e.printStackTrace();
|
||||
}
|
||||
} else
|
||||
fileLocation = "File not retrieved";
|
||||
} else
|
||||
}
|
||||
} else {
|
||||
status = "non-accessible";
|
||||
if ( data.getCouldRetry().equals("true") )
|
||||
error = new Error(Error.ErrorType.couldRetry, comment);
|
||||
else
|
||||
error = new Error(Error.ErrorType.noRetry, comment);
|
||||
}
|
||||
|
||||
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), data.getDocOrDatasetUrl(), new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
|
||||
AssignmentHandler.urlReports.add(new UrlReport(status, payload));
|
||||
String docOrDatasetUrl = data.getDocOrDatasetUrl();
|
||||
if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
|
||||
docOrDatasetUrl = null;
|
||||
|
||||
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
|
||||
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||
|
||||
AssignmentHandler.urlReports.add(new UrlReport(status, payload, error));
|
||||
}
|
||||
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
|
||||
}
|
||||
|
@ -195,7 +216,7 @@ public class PublicationsRetrieverPlugin {
|
|||
try {
|
||||
return HttpConnUtils.connectAndCheckMimeType("null", urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
|
||||
} catch (Exception e) {
|
||||
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false");
|
||||
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,11 +27,13 @@ public class AssignmentHandler {
|
|||
|
||||
public static List<UrlReport> urlReports = null;
|
||||
|
||||
private static final boolean askForTest = true; // Enable this only for testing.
|
||||
|
||||
|
||||
public static AssignmentRequest requestAssignments()
|
||||
{
|
||||
RestTemplate restTemplate = new RestTemplateBuilder().build();
|
||||
String url = "http://localhost:1880/api/urls/test?workerId=" + WorkerConstants.WORKER_ID + "&workerAssignmentsLimit=" + WorkerConstants.ASSIGNMENTS_LIMIT;
|
||||
String url = "http://localhost:1880/api/urls" + (askForTest ? "/test" : "") + "?workerId=" + WorkerConstants.WORKER_ID + "&workerAssignmentsLimit=" + WorkerConstants.ASSIGNMENTS_LIMIT;
|
||||
String json = null;
|
||||
try {
|
||||
json = restTemplate.getForObject(url, String.class);
|
||||
|
@ -58,7 +60,6 @@ public class AssignmentHandler {
|
|||
//logger.debug(assignmentRequest.toString()); // DEBUG!
|
||||
|
||||
logger.info("AssignmentRequest < " + assignmentRequest.getAssignmentCounter() + " > was received and it's ready to be processed. It contains " + assignmentRequest.getAssignments().size() + " tasks.");
|
||||
// TODO - Maybe create a HashSet with these IDs. It may be useful for the Worker to know and report which assignments (and how many) it has processed.
|
||||
|
||||
return assignmentRequest;
|
||||
}
|
||||
|
@ -67,6 +68,10 @@ public class AssignmentHandler {
|
|||
public static void handleAssignments()
|
||||
{
|
||||
AssignmentRequest assignmentRequest = requestAssignments();
|
||||
if ( assignmentRequest == null ) {
|
||||
logger.error("The \"assignmentRequest\" was \"null\"!");
|
||||
return;
|
||||
}
|
||||
|
||||
Long assignmentRequestCounter = assignmentRequest.getAssignmentCounter();
|
||||
|
||||
|
|
Loading…
Reference in New Issue