forked from lsmyrnaios/UrlsController
- Avoid getting and uploading (to S3) full-texts which were already uploaded by previous assignments-batches.
- Fix not updating the fileLocation with the s3Url for records which share the same full-text.
- Set only one delete-order for the files of each assignments-batch, not one (or more, by mistake) per zip-batch.
- Set the HttpStatus to "204 - NO_CONTENT" when no assignments are available to be returned to the Worker.
- Fix not unlocking the "databaseLock" in case of a database-connection error in "addWorkerReport()".
- Improve some log-messages.
- Change the log-level of the "S3-bucket already exists" message.
- Update Gradle.
- Optimize imports.
- Code cleanup.
parent 0178e44574
commit 33ba3e8d91
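The core of the change is a per-record dedup check against the "payload" table: before requesting a full-text from the Worker, the Controller looks up the record's file-hash, and if an identical file was already uploaded by an earlier assignments-batch, it reuses that S3 location instead of transferring the file again. A condensed Java sketch of that check (the helper method and its wiring are illustrative, not the verbatim code; the real logic lives inside "FileUtils.getAndUploadFullTexts()", shown in the diff below, and reuses one PreparedStatement for all records):

    // Hypothetical helper, for illustration only.
    // Returns the S3-url of an identical, previously-uploaded full-text, or null if none exists.
    private static String findExistingS3Location(Connection con, String fileHash) throws SQLException {
        String query = "select `location` from " + ImpalaConnector.databaseName + ".payload where `hash` = ?";
        try ( PreparedStatement ps = con.prepareStatement(query) ) {
            ps.setString(1, fileHash);
            try ( ResultSet rs = ps.executeQuery() ) {
                return rs.next() ? rs.getString(1) : null; // null --> not uploaded before; request it from the Worker.
            }
        }
    }

For each record with a non-null hash, a non-null result means "payload.setLocation(s3Url)" is called and the file is skipped; a null result adds the file to the batch requested from the Worker. This also covers the second bullet: records sharing the same full-text all end up pointing to the single uploaded s3Url.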
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.1-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.2-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
@@ -8,7 +8,7 @@ elif [[ $# -gt 1 ]]; then
 	echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: script.sh <justInstall: 0 | 1>"; exit 1
 fi
 
-gradleVersion="7.3.1"
+gradleVersion="7.3.2"
 
 if [[ justInstall -eq 0 ]]; then
 
@@ -8,8 +8,8 @@ import org.springframework.boot.CommandLineRunner;
 import org.springframework.boot.SpringApplication;
 import org.springframework.boot.autoconfigure.SpringBootApplication;
 import org.springframework.context.annotation.Bean;
-import org.springframework.scheduling.annotation.EnableScheduling;
 import org.springframework.core.env.Environment;
+import org.springframework.scheduling.annotation.EnableScheduling;
 import org.springframework.web.cors.CorsConfiguration;
 import org.springframework.web.cors.CorsConfigurationSource;
 import org.springframework.web.cors.UrlBasedCorsConfigurationSource;
@@ -2,7 +2,7 @@ package eu.openaire.urls_controller.components;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.scheduling.annotation.Scheduled;
+//import org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Component;
 
 import java.text.SimpleDateFormat;
@@ -4,6 +4,7 @@ import com.zaxxer.hikari.HikariConfig;
 import com.zaxxer.hikari.HikariDataSource;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.beans.PropertyVetoException;
 import java.io.File;
 import java.io.FileReader;
@@ -6,7 +6,9 @@ import eu.openaire.urls_controller.models.Error;
 import eu.openaire.urls_controller.models.*;
 import eu.openaire.urls_controller.payloads.requests.WorkerReport;
 import eu.openaire.urls_controller.payloads.responces.AssignmentsResponse;
-import eu.openaire.urls_controller.util.*;
+import eu.openaire.urls_controller.util.ControllerConstants;
+import eu.openaire.urls_controller.util.FileUtils;
+import eu.openaire.urls_controller.util.GenericUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.http.HttpStatus;
@@ -15,7 +17,6 @@ import org.springframework.web.bind.annotation.*;
 
 import javax.servlet.http.HttpServletRequest;
 import java.sql.*;
-
 import java.util.*;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.regex.Pattern;
@@ -151,7 +152,7 @@ public class UrlController {
 			ImpalaConnector.databaseLock.unlock();
 			String errorMsg = "No results retrieved from the \"getAssignmentsQuery\" for worker with id: " + workerId;
 			logger.error(errorMsg);
-			return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
+			return ResponseEntity.status(HttpStatus.NO_CONTENT).body(errorMsg);
 		}*/
 
 		// The cursor is automatically before the first element in this configuration.
@@ -199,7 +200,7 @@ public class UrlController {
 			errorMsg = "No results retrieved from the \"findAssignmentsQuery\" for worker with id: " + workerId;
 			logger.error(errorMsg);
 			ImpalaConnector.closeConnection(con);
-			return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
+			return ResponseEntity.status(HttpStatus.NO_CONTENT).body(errorMsg);
 		}
 
 		logger.debug("Finished gathering " + assignmentsSize + " assignments for worker with id \"" + workerId + "\". Going to insert them into the \"assignment\" table and then return them to the worker.");
@@ -286,19 +287,24 @@ public class UrlController {
 		long curReportAssignments = workerReport.getAssignmentRequestCounter();
 		logger.info("Received the WorkerReport for batch-assignments_" + curReportAssignments + ", from the worker with id: " + curWorkerId + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database.");
 
+		ImpalaConnector.databaseLock.lock();
+
+		Connection con = ImpalaConnector.getInstance().getConnection();
+		if ( con == null ) {
+			ImpalaConnector.databaseLock.unlock();
+			return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Problem when connecting with the Impala-database!");
+		}
+
 		// Before continuing with inserts, take and upload the fullTexts from the Worker. Also, update the file-"location".
-		if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, curReportAssignments, curWorkerId) ) {
+		if ( ! FileUtils.getAndUploadFullTexts(urlReports, con, request, curReportAssignments, curWorkerId) ) {
 			logger.error("Failed to get and/or upload the fullTexts for assignments_" + curReportAssignments);
 			// The docUrls were still found! Just update ALL the fileLocations. sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads.
 			FileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports);
 		}
 
+		// The "databaseLock" was unlocked inside the "FileUtils.getAndUploadFullTexts" to avoid blocking the database while doing large irrelevant tasks like transferring files.
 		ImpalaConnector.databaseLock.lock();
 
-		Connection con = ImpalaConnector.getInstance().getConnection();
-		if ( con == null )
-			return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Problem when connecting with the Impala-database!");
-
 		// Store the workerReport into the database.
 		String insertIntoPayloadBaseQuery = "INSERT INTO " + ImpalaConnector.databaseName + ".payload (id, original_url, actual_url, date, mimetype, size, hash, location, provenance) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)";
 		String insertIntoAttemptBaseQuery = "INSERT INTO " + ImpalaConnector.databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)";
@@ -322,8 +328,8 @@ public class UrlController {
 			con.setAutoCommit(false); // Avoid writing to disk for each insert. Write them all in the end.
 		} catch (SQLException sqle) {
 			ImpalaConnector.databaseLock.unlock();
-			String errorMsg = "Problem when setting Connection.AutoCommit to \"false\"!";
-			logger.error(errorMsg + "\n" + sqle.getMessage());
+			String errorMsg = "Problem when setting Connection.AutoCommit to \"false\"!\n";
+			logger.error(errorMsg + sqle.getMessage());
 			closePreparedStatements(preparedInsertPayloadStatement, preparedInsertAttemptStatement, con);
 			return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
 		}
@@ -362,7 +368,7 @@ public class UrlController {
 			}
 
 			Error error = urlReport.getError();
-			if ( error == null ) { // A bit rare to happen, but we should fix it (otherwise NPEs will be thrown for the rest of the loop)
+			if ( error == null ) { // A bit rare to happen, but we should fix it (otherwise NPEs will be thrown for the rest of this loop).
 				logger.warn("Error was \"null\" for \"urlReport\": " + urlReport + "\nSetting an empty object with \"null\" members.");
 				error = new Error(null, null);
 			}
@@ -409,6 +415,7 @@ public class UrlController {
 
 		// This will delete the rows of the "assignment" table which refer to the curWorkerId. As we have non-kudu Impala tables, the Delete operation can only succeed through a "merge" operation of the rest of the data.
 		// Only the rows referring to OTHER workerIDs get stored in a temp-table, while the "assignment" table gets deleted. Then, the temp_table becomes the "assignment" table.
+		// We do not need to keep the assignment-info anymore, the "findAssignmentsQuery" checks the payload table for previously handled tasks.
 		mergeErrorMsg = FileUtils.mergeParquetFiles("assignment", con, " WHERE workerid != ", curWorkerId);
 		if ( mergeErrorMsg != null ) {
 			ImpalaConnector.databaseLock.unlock();
@@ -20,7 +20,7 @@ public class FileUnZipper {
 
 	public static void unzipFolder(Path source, Path target) throws Exception
 	{
-		try (ZipInputStream zis = new ZipInputStream(new FileInputStream(source.toFile())))
+		try ( ZipInputStream zis = new ZipInputStream(new FileInputStream(source.toFile())) )
 		{
 			// Iterate over the files in zip and un-zip them.
 			ZipEntry zipEntry = zis.getNextEntry();
@@ -31,13 +31,12 @@ public class FileUnZipper {
 				if ( zipEntry.getName().endsWith(File.separator) ) // If we have a directory.
 					Files.createDirectories(targetPath);
 				else {
-					// Some zip stored file path only, need create parent directories, e.g data/folder/file.txt
-					if ( targetPath.getParent() != null ) {
-						if ( Files.notExists(targetPath.getParent()) ) {
-							Files.createDirectories(targetPath.getParent());
-						}
-					}
-					Files.copy(zis, targetPath, StandardCopyOption.REPLACE_EXISTING);
+					// Some zip-files store only the file-paths and not separate directories. We need to create parent directories, e.g data/folder/file.txt
+					Path parentPath = targetPath.getParent();
+					if ( (parentPath != null) && Files.notExists(parentPath) ) {
+						Files.createDirectories(parentPath);
+					}
+					Files.copy(zis, targetPath, StandardCopyOption.REPLACE_EXISTING); // Copy an individual entry.
 				}
 				zipEntry = zis.getNextEntry();
 			}
@@ -17,13 +17,8 @@ import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.sql.Connection;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Scanner;
+import java.sql.*;
+import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -116,29 +111,77 @@ public class FileUtils {
 	public static final String baseTargetLocation = System.getProperty("user.dir") + File.separator + "fullTexts" + File.separator;
 	private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
 
-	public static boolean getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId)
+	public static boolean getAndUploadFullTexts(List<UrlReport> urlReports, Connection con, HttpServletRequest request, long assignmentsBatchCounter, String workerId)
 	{
 		// The Controller have to request the files from the Worker, in order to upload them to the S3.
 		// We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database.
 
 		if ( request == null ) {
 			logger.error("The \"HttpServletRequest\" is null!");
+			ImpalaConnector.databaseLock.unlock();
 			return false;
 		}
 		String remoteAddr = request.getHeader("X-FORWARDED-FOR");
 		if ( remoteAddr == null || "".equals(remoteAddr) )
 			remoteAddr = request.getRemoteAddr();
 
+		String getFileLocationForHashQuery = "select `location` from " + ImpalaConnector.databaseName + ".payload where `hash` = ?" ;
+		PreparedStatement getFileLocationForHashPreparedStatement = null;
+		try {
+			getFileLocationForHashPreparedStatement = con.prepareStatement(getFileLocationForHashQuery);
+		} catch (SQLException sqle) {
+			ImpalaConnector.databaseLock.unlock();
+			logger.error("Problem when creating the prepared statement for \"" + getFileLocationForHashQuery + "\"!\n" + sqle.getMessage());
+			return false;
+		}
+
-		// Get the file-locations.
-		List<String> allFileNames = new ArrayList<>(urlReports.size()/2);
-		for ( UrlReport urlReport : urlReports ) {
+		int numFullTextUrlsFound = 0;
+		int numFilesFoundFromPreviousAssignmentsBatches = 0;
+		HashMultimap<String, String> allFileNamesWithIDsHashMap = HashMultimap.create((urlReports.size() / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
+
+		for ( UrlReport urlReport : urlReports )
+		{
 			UrlReport.StatusType statusType = urlReport.getStatus();
 			if ( (statusType == null) || statusType.equals(UrlReport.StatusType.non_accessible) ) {
 				continue;
 			}
+			numFullTextUrlsFound ++;
 
 			Payload payload = urlReport.getPayload();
-			if ( payload != null ) {
-				String fileLocation = payload.getLocation();
+			if ( payload != null )
+			{
+				String fileLocation = null;
+
+				// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
+				// If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
+				// If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
+				// Use the same prepared-statement for all requests, to improve speed (just like when inserting similar thing to the DB).
+				String fileHash = payload.getHash();
+				if ( fileHash != null ) {
+					try {
+						getFileLocationForHashPreparedStatement.setString(1, fileHash);
+					} catch (SQLException sqle) {
+						logger.error("Error when setting the parameter in \"getFileLocationForHashQuery\"!\n" + sqle.getMessage());
+					}
+
+					try ( ResultSet resultSet = getFileLocationForHashPreparedStatement.executeQuery() ) {
+						if ( resultSet.next() ) { // Move the "cursor" to the first row. If there is any data..
+							fileLocation = resultSet.getString(1);
+							if ( fileLocation != null ) { // If the full-text of this record is already-found.
+								payload.setLocation(fileLocation); // Set the location to the older identical file, which was uploaded to S3.
+								logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + fileLocation + "\".");
+								numFilesFoundFromPreviousAssignmentsBatches ++;
+								continue;
+							}
+						}
+					} catch (Exception e) {
+						logger.error("Error when executing or acquiring data from the the \"getFileLocationForHashQuery\"!\n" + e.getMessage());
+					}
+				}
+
+				// If the full-text of this record was not found by a previous batch..
+				fileLocation = payload.getLocation();
 				if ( fileLocation != null ) { // If the docFile was downloaded (without an error)..
 					Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation);
 					if ( !matcher.matches() ) {
@@ -148,11 +191,26 @@ public class FileUtils {
 					if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
 						continue;
 					}
-					allFileNames.add(fileNameWithExtension);
+					allFileNamesWithIDsHashMap.put(fileNameWithExtension, payload.getId()); // The keys and the values are not duplicate. Task with ID-1 might have an "ID-1.pdf" file.
+					// While a task with ID-2 can also have an "ID-1.pdf" file, as the pdf-url-2 might be the same with pdf-url-1.
 				}
 			}
 		}
 
+		// Close the Prepared Statement.
+		try {
+			if ( getFileLocationForHashPreparedStatement != null )
+				getFileLocationForHashPreparedStatement.close();
+		} catch (SQLException sqle) {
+			logger.error("Failed to close the \"getFileLocationForHashPreparedStatement\"!\n" + sqle.getMessage());
+		} finally {
+			ImpalaConnector.databaseLock.unlock(); // The rest work of this function does not use the database.
+		}
+
+		logger.info("NumFullTextUrlsFound by assignments_" + assignmentsBatchCounter + " = " + numFullTextUrlsFound + " (out of " + urlReports.size() + ").");
+		logger.debug("NumFilesFoundFromPreviousAssignmentsBatches = " + numFilesFoundFromPreviousAssignmentsBatches);
+
+		ArrayList<String> allFileNames = new ArrayList<>(allFileNamesWithIDsHashMap.keySet());
 		int numAllFullTexts = allFileNames.size();
 		if ( numAllFullTexts == 0 ) {
 			logger.warn("The file retrieved by the Worker where < 0 > for assignments_" + assignmentsBatchCounter);
@@ -164,17 +222,17 @@ public class FileUtils {
 		if ( (numAllFullTexts % numOfFullTextsPerBatch) > 0 ) // Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
 			numOfBatches ++;
 
-		logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numAllFullTexts + " fullTexts. Going to request them from the Worker, in " + numOfBatches + " batches.");
+		logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numAllFullTexts + " distinct non-already-uploaded fullTexts. Going to request them from the Worker \"" + workerId + "\", in " + numOfBatches + " batches.");
 
 		// Check if one full text is left out because of the division. Put it int the last batch.
 		String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTexts/" + assignmentsBatchCounter + "/" + numOfBatches + "/";
 
-		// Index all UrlReports to be more efficiently searched later.
-		HashMap<String, Payload> payloadsHashMap = new HashMap<>(urlReports.size());
+		// Index all Payloads to be more efficiently searched later.
+		HashMultimap<String, Payload> payloadsHashMultimap = HashMultimap.create((urlReports.size() / 3), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
 		for ( UrlReport urlReport : urlReports ) {
 			Payload payload = urlReport.getPayload();
 			if ( payload != null )
-				payloadsHashMap.put(payload.getId(), payload);
+				payloadsHashMultimap.put(payload.getId(), payload);
 		}
 
 		String curAssignmentsBaseLocation = baseTargetLocation + "assignments_" + assignmentsBatchCounter + File.separator;
@@ -186,37 +244,35 @@ public class FileUtils {
 			List<String> fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, batchCounter);
 			HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, batchCounter, fileNamesForCurBatch, numOfBatches, workerId);
 			if ( conn == null ) {
-				updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
+				updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMultimap, fileNamesForCurBatch);
 				failedBatches ++;
 				continue; // To the next batch.
 			}
 
-			String targetLocation = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator;
-			File curBatchDir = new File(targetLocation);
+			// Get the extracted files.
+			String targetDirectory = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator;
+			File curBatchDir = new File(targetDirectory);
 			try {
-				// Get the extracted files.,
-				Path targetPath = Files.createDirectories(Paths.get(targetLocation));
+				// Create this batch-directory.
+				Path curBatchPath = Files.createDirectories(Paths.get(targetDirectory));
 
 				// Unzip the file. Iterate over the PDFs and upload each one of them and get the S3-Url
-				String zipFileFullPath = targetLocation + "fullTexts_" + assignmentsBatchCounter + "_" + batchCounter + ".zip";
+				String zipFileFullPath = targetDirectory + "fullTexts_" + assignmentsBatchCounter + "_" + batchCounter + ".zip";
 				File zipFile = new File(zipFileFullPath);
 
 				if ( ! saveZipFile(conn, zipFile) ) {
-					updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
-					deleteDirectory(curBatchDir);
+					updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMultimap, fileNamesForCurBatch);
 					failedBatches ++;
 					continue; // To the next batch.
 				}
 
 				//logger.debug("The zip file has been saved: " + zipFileFullPath); // DEBUG!
 
-				FileUnZipper.unzipFolder(Paths.get(zipFileFullPath), targetPath);
+				FileUnZipper.unzipFolder(Paths.get(zipFileFullPath), curBatchPath);
 
-				String[] fileNames = curBatchDir.list();
-				if ( (fileNames == null) || (fileNames.length == 0) ) {
-					logger.error("No filenames where extracted from directory: " + targetLocation);
-					updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
-					deleteDirectory(curBatchDir);
+				String[] fileNames = new File(targetDirectory).list();
+				if ( (fileNames == null) || (fileNames.length <= 1 ) ) { // The directory might have only one file, the "zip-file".
+					logger.error("No full-text fileNames where extracted from directory: " + targetDirectory);
+					updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMultimap, fileNamesForCurBatch);
 					failedBatches ++;
 					continue; // To the next batch.
 				}
@@ -225,48 +281,38 @@ public class FileUtils {
 				int numUploadedFiles = 0;
 				for ( String fileName : fileNames )
 				{
-					String fileFullPath = targetLocation + fileName;
-					if ( fileFullPath.equals(zipFileFullPath) ) { // Exclude the zip-file from uploading.
+					String fileFullPath = targetDirectory + fileName;
+					if ( fileFullPath.equals(zipFileFullPath) ) // Exclude the zip-file from uploading.
 						continue;
+
+					// Check if this stored file is related to one or more IDs from the Set.
+					Set<String> fileRelatedIDs = allFileNamesWithIDsHashMap.get(fileName);
+					if ( fileRelatedIDs.isEmpty() ) { // In case the "fileName" is not inside the "allFileNamesWithIDsHashMap" HashMultimap.
+						logger.error("The stored file \"" + fileName + "\" is not related to any ID which had a file requested from the Worker!");
+						continue;
 					}
-					// Get the ID of the file.
-					Matcher matcher = FILENAME_ID.matcher(fileName);
-					if ( !matcher.matches() ) {
+
+					if ( isFileNameProblematic(fileName, payloadsHashMultimap) ) // Do some more checks.
 						continue;
-					}
-					String id = matcher.group(1);
-					if ( (id == null) || id.isEmpty() ) {
-						continue;
-					}
-					Payload payload = payloadsHashMap.get(id);
-					if ( payload == null ) {
-						continue;
-					}
-					String location = payload.getLocation();
-					if ( location == null ) {
-						continue;
-					}
-					if ( ! location.endsWith(fileName) ) { // That should NEVER happen...
-						logger.error("The location \"" + location + "\" of the payload matched with the ID \"" + id + "\" is not ending with the filename it was supposed to \"" + fileName + "\"");
-						continue;
-					}
+
+					// At this point, we know that this file is related with one or more IDs of the payloads AND it has a valid fileName.
+					// Let's try to upload the file to S3 and update the payloads of all related IDs, either in successful upload or not.
 
 					String s3Url = S3ObjectStoreMinIO.uploadToS3(fileName, fileFullPath);
 					if ( s3Url != null ) {
-						payload.setLocation(s3Url); // Update the file-location to the new S3-url.
-						numUploadedFiles ++;
-					} else
-						setUnretrievedFullText(payload);
+						setFullTextForMultipleIDs(payloadsHashMultimap, fileRelatedIDs, s3Url); // It checks weather (s3Url != null) and acts accordingly.
+						numUploadedFiles++;
 					}
+					// Else, the record will have its file-data set to "null", in the end of this method.
 				}
 
-				logger.info("Finished uploading " + numUploadedFiles + " full-texts of assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore.");
+				logger.info("Finished uploading " + numUploadedFiles + " full-texts (out of " + (fileNames.length -1) + " distinct files) from assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore.");
+				// (fileNames.length -1) --> minus the zip-file
 
 			} catch (Exception e) {
 				logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
-				updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
+				updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMultimap, fileNamesForCurBatch);
 				failedBatches ++;
+			} finally {
+				deleteDirectory(curBatchDir); // Delete the files of this batch (including the zip-file).
 			}
 		} // End of batches.
@@ -278,7 +324,7 @@ public class FileUtils {
 			logger.error("None of the " + numOfBatches + " batches could be handled for assignments_" + assignmentsBatchCounter + ", for worker: " + workerId);
 			return false;
 		} else {
-			replaceNotUploadedFileLocations(urlReports);
+			replaceNotUploadedFileLocations(urlReports); // Make sure all records without an s3Url have null file-data.
 			return true;
 		}
 	}
@@ -315,9 +361,8 @@ public class FileUtils {
 			String inputLine;
 			while ( (inputLine = br.readLine()) != null )
 			{
-				if ( !inputLine.isEmpty() ) {
+				if ( !inputLine.isEmpty() )
 					errorMsgStrB.append(inputLine);
-				}
 			}
 			return (errorMsgStrB.length() != 0) ? errorMsgStrB.toString() : null; // Make sure we return a "null" on empty string, to better handle the case in the caller-function.
 		} catch ( IOException ioe ) {
@@ -367,6 +412,74 @@ public class FileUtils {
 	}
 
 
+	private static final int bufferSize = 20971520; // 20 MB
+	public static boolean saveZipFile(HttpURLConnection conn, File zipFile)
+	{
+		InputStream inStream = null;
+		FileOutputStream outStream = null;
+		try {
+			inStream = conn.getInputStream();
+			outStream = new FileOutputStream(zipFile);
+
+			byte[] byteBuffer = new byte[bufferSize]; // 20 MB
+			int bytesRead = -1;
+			while ( (bytesRead = inStream.read(byteBuffer, 0, bufferSize)) != -1 ) {
+				outStream.write(byteBuffer, 0, bytesRead);
+			}
+			return true;
+		} catch (Exception e) {
+			logger.error("Could not save the zip file \"" + zipFile.getName() + "\": " + e.getMessage(), e);
+			return false;
+		} finally {
+			try {
+				if ( inStream != null )
+					inStream.close();
+				if ( outStream != null )
+					outStream.close();
+			} catch (Exception e) {
+				logger.error(e.getMessage(), e);
+			}
+		}
+	}
+
+
+	private static boolean isFileNameProblematic(String fileName, HashMultimap<String, Payload> payloadsHashMultimap)
+	{
+		// Get the ID of the file.
+		Matcher matcher = FILENAME_ID.matcher(fileName);
+		if ( !matcher.matches() ) {
+			logger.error("The given fileName \"" + fileName + "\" was invalid! Could not be matched with matcher: " + matcher);
+			return true;
+		}
+		String fileID = matcher.group(1);
+		if ( (fileID == null) || fileID.isEmpty() ) {
+			logger.error("The given fileName \"" + fileName + "\" was invalid. No fileID was extracted!");
+			return true;
+		}
+
+		// Take the payloads which are related with this ID. An ID might have multiple original-urls, thus multiple payloads.
+		// The ID we have here, is the one from the first record which reached to this file.
+		// There might be other records pointing to this file. But, in order to mark this file as "valid", we have to match it with at least one of the records-IDs.
+		// We do this process to avoid handling and uploading irrelevant files which could find their way to the working directory (either because of a Worker's error or any other type of malfunction or even malicious action).
+
+		Set<Payload> payloads = payloadsHashMultimap.get(fileID);
+		if ( payloads.isEmpty() ) {
+			logger.error("The given fileID \"" + fileID + "\" was not part of the \"payloadsHashMultimap\"!");
+			return true;
+		}
+
+		// Search through the payloads to find at least one match, in order for this file to NOT be "problematic".
+		for ( Payload payload : payloads )
+		{
+			String location = payload.getLocation();
+			if ( (location != null) && location.endsWith(fileName) )
+				return false; // It's not problematic.
+		}
+
+		logger.error("None of the locations of the payloads matched with the ID \"" + fileID + "\" are ending with the filename \"" + fileName + "\" they were supposed to.");
+		return true;
+	}
+
+
 	/**
 	 * This method updates the UrlReports to not point to any downloaded fullText files.
 	 * This is useful when the uploading process of the fullTexts to the S3-ObjectStore fails.
@@ -397,7 +510,7 @@ public class FileUtils {
 	}
 
 
-	public static void updateUrlReportsForCurBatchTOHaveNoFullTextFiles(HashMap<String, Payload> payloadsHashMap, List<String> fileNames)
+	public static void updateUrlReportsForCurBatchTOHaveNoFullTextFiles(HashMultimap<String, Payload> payloadsHashMultimap, List<String> fileNames)
 	{
 		for ( String fileName : fileNames ) {
 			// Get the ID of the file.
@@ -409,9 +522,11 @@ public class FileUtils {
 			if ( (id == null) || id.isEmpty() ) {
 				continue;
 			}
-			Payload payload = payloadsHashMap.get(id);
-			if ( payload != null )
-				setUnretrievedFullText(payload); // It changes the payload in the original UrlReport list.
+			Set<Payload> payloads = payloadsHashMultimap.get(id);
+			// Set for all payloads connected to this ID.
+			for ( Payload payload : payloads )
+				if ( payload != null )
+					setUnretrievedFullText(payload); // It changes the payload in the original UrlReport list.
 		}
 	}
 
@@ -426,33 +541,24 @@ public class FileUtils {
 	}
 
 
-	private static final int bufferSize = 20971520; // 20 MB
-	public static boolean saveZipFile(HttpURLConnection conn, File zipFile)
+	/**
+	 * Set the fileLocation for all those IDs related to the File. The IDs may have one or more payloads.
+	 * @param payloadsHashMultimap
+	 * @param fileIDs
+	 * @param s3Url
+	 */
+	public static void setFullTextForMultipleIDs(HashMultimap<String, Payload> payloadsHashMultimap, Set<String> fileIDs, String s3Url)
 	{
-		FileOutputStream outStream = null;
-		InputStream inStream = null;
-		try {
-			inStream = conn.getInputStream();
-			outStream = new FileOutputStream(zipFile);
+		for ( String id : fileIDs ) {
+			Set<Payload> payloads = payloadsHashMultimap.get(id);
+			if ( payloads.isEmpty() ) {
+				logger.error("The given id \"" + id + "\" (coming from the \"allFileNamesWithIDsHashMap\"), is not found inside the \"payloadsHashMultimap\"!");
+				continue;
+			}
 
-			byte[] byteBuffer = new byte[bufferSize]; // 20 MB
-			int bytesRead = -1;
-			while ( (bytesRead = inStream.read(byteBuffer, 0, bufferSize)) != -1 ) {
-				outStream.write(byteBuffer, 0, bytesRead);
-			}
-			return true;
-		} catch (Exception e) {
-			logger.error("Could not save the zip file \"" + zipFile.getName() + "\": " + e.getMessage(), e);
-			return false;
-		} finally {
-			try {
-				if ( inStream != null )
-					inStream.close();
-				if ( outStream != null )
-					outStream.close();
-			} catch (Exception e) {
-				logger.error(e.getMessage(), e);
-			}
-		}
+			for ( Payload payload : payloads )
+				if ( payload.getHash() != null ) // Update only for the records which led to a file, not all the records of this ID (an ID might have multiple original_urls pointing to different directions).
+					payload.setLocation(s3Url); // Update the file-location to the new S3-url. All the other file-data is already set from the Worker.
 		}
 	}
 
@@ -97,7 +97,7 @@ public class S3ObjectStoreMinIO {
 			minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build());
 		}
 		else
-			logger.warn("Bucket \"" + bucketName + "\" already exists.");
+			logger.debug("Bucket \"" + bucketName + "\" already exists.");
 	} catch (Exception e) {
 		String errorMsg = "Could not create the bucket \"" + bucketName + "\"!";
 		logger.error(errorMsg ,e);