Browse Source

- Implement the "getAndUploadFullTexts" functionality. In order to access the S3-ObjectStore from one trusted place, the Controller will request the files from the workers and upload them on S3. Afterwards, the workers will delete those files from their local storage. Previously, each worker uploaded its own files.

- Move the "mergeParquetFiles" and "getCutBatchExceptionMessage" methods inside the "FileUtils" class.
- Code cleanup.
springify_project
Lampros Smyrnaios 6 months ago
parent
commit
48eed20dd8
  1. 6
      README.md
  2. 5
      build.gradle
  3. 2
      src/main/java/eu/openaire/urls_controller/Application.java
  4. 84
      src/main/java/eu/openaire/urls_controller/controllers/UrlController.java
  5. 61
      src/main/java/eu/openaire/urls_controller/util/FileUnZipper.java
  6. 408
      src/main/java/eu/openaire/urls_controller/util/FileUtils.java
  7. 226
      src/main/java/eu/openaire/urls_controller/util/S3ObjectStoreMinIO.java

6
README.md

@ -6,6 +6,10 @@ Then it receives the "WorkerReports" and writes them into the database.<br>
The database used is the [Impala](https://impala.apache.org/) .<br>
[...] <br>
To install and run the application, run ```git clone``` and then execute the ```installAndRun.sh``` script.<br>
To install and run the application, run ```git clone```.
Then, provide a file *"S3_minIO_credentials.txt"*, inside the *working directory*.<br>
In the *"S3_minIO_credentials.txt"* file, you should provide the *endpoint*, the *accessKey*, the *secretKey*, the *region* and the *bucket*, in that order, separated by commas.
<br>
Afterwards, execute the ```installAndRun.sh``` script.<br>
If you want to just run the app, then run the script with the argument "1": ```./installAndRun.sh 1```.<br>
<br>

5
build.gradle

@ -43,6 +43,11 @@ dependencies {
// https://mvnrepository.com/artifact/com.google.guava/guava
implementation group: 'com.google.guava', name: 'guava', version: '31.0.1-jre'
implementation 'io.minio:minio:8.3.3'
// https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp
implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' // This is required by the minio, as Spring uses a version which is not supported by minio.
// https://mvnrepository.com/artifact/com.cloudera.impala/jdbc
implementation group: 'com.cloudera.impala', name: 'jdbc', version: '2.5.31'

2
src/main/java/eu/openaire/urls_controller/Application.java

@ -2,6 +2,7 @@ package eu.openaire.urls_controller;
import eu.openaire.urls_controller.configuration.ImpalaConnector;
import eu.openaire.urls_controller.util.S3ObjectStoreMinIO;
import eu.openaire.urls_controller.util.UriBuilder;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
@ -22,6 +23,7 @@ import java.util.Collections;
public class Application {
public static void main(String[] args) {
new S3ObjectStoreMinIO();
SpringApplication.run(Application.class, args);
}

84
src/main/java/eu/openaire/urls_controller/controllers/UrlController.java

@ -13,6 +13,7 @@ import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletRequest;
import java.sql.*;
import java.sql.Date;
@ -145,11 +146,7 @@ public class UrlController {
ImpalaConnector.databaseLock.unlock();
String errorMsg = "Problem when executing the \"getAssignmentsQuery\"!\n";
logger.error(errorMsg, e);
try {
con.close();
} catch (SQLException sqle2) {
logger.error("Could not close the connection with the Impala-database.\n" + sqle2.getMessage());
}
ImpalaConnector.closeConnection(con);
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
} finally {
try {
@ -164,11 +161,7 @@ public class UrlController {
ImpalaConnector.databaseLock.unlock();
String errorMsg = "No results retrieved from the \"getAssignmentsQuery\" for worker with id: " + workerId;
logger.error(errorMsg);
try {
con.close();
} catch (SQLException sqle2) {
logger.error("Could not close the connection with the Impala-database.\n" + sqle2.getMessage());
}
ImpalaConnector.closeConnection(con);
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
}
@ -240,7 +233,7 @@ public class UrlController {
logger.debug("Finished inserting " + assignmentsSize + " assignments into the \"assignment\"-table. Going to merge the parquet files for this table.");
String mergeErrorMsg = mergeParquetFiles("assignment", con);
String mergeErrorMsg = FileUtils.mergeParquetFiles("assignment", con);
if ( mergeErrorMsg != null ) {
ImpalaConnector.databaseLock.unlock();
closePreparedStatements(preparedInsertAssignmentStatement, null, con);
@ -265,7 +258,7 @@ public class UrlController {
@PostMapping("addWorkerReport")
public ResponseEntity<?> addWorkerReport(@RequestBody WorkerReport workerReport) {
public ResponseEntity<?> addWorkerReport(@RequestBody WorkerReport workerReport, HttpServletRequest request) {
if ( workerReport == null ) {
String errorMsg = "No \"WorkerReport\" was given!";
@ -280,8 +273,14 @@ public class UrlController {
return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg);
}
logger.info("Received the WorkerReport for batch_ " + workerReport.getAssignmentRequestCounter() + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to insert them into the database.");
logger.info("Received the WorkerReport for batch-assignments_" + workerReport.getAssignmentRequestCounter() + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database.");
// Before continuing with inserts, take and upload the fullTexts from the Worker. Also, update the file-"location".
if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, assignmentsBatchCounter, workerReport.getWorkerId()) ) {
logger.error("Failed to get and/or upload the fullTexts for assignments_" + assignmentsBatchCounter);
// The docUrls were still found! Just update ALL the fileLocations, sizes and hashes, to show that the files are not available, and continue with writing the attempts and the Payloads.
FileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports);
}
ImpalaConnector.databaseLock.lock();
@ -384,14 +383,14 @@ public class UrlController {
logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables. Going to merge the parquet files for those tables.");
String mergeErrorMsg = mergeParquetFiles("payload", con);
String mergeErrorMsg = FileUtils.mergeParquetFiles("payload", con);
if ( mergeErrorMsg != null ) {
ImpalaConnector.databaseLock.unlock();
ImpalaConnector.closeConnection(con);
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg);
}
mergeErrorMsg = mergeParquetFiles("attempt", con);
mergeErrorMsg = FileUtils.mergeParquetFiles("attempt", con);
if ( mergeErrorMsg != null ) {
ImpalaConnector.databaseLock.unlock();
ImpalaConnector.closeConnection(con);
@ -415,50 +414,6 @@ public class UrlController {
}
/**
* In each insertion, a new parquet-file is created, so we end up with millions of files. Parquet is great for fast-select, so have to stick with it and merge those files..
* This method, creates a clone of the original table in order to have only one parquet file in the end. Drops the original table.
* Renames the clone to the original's name.
* Returns the errorMsg, if an error appears, otherwise it returns "null".
* */
// Merges the many small parquet files of the given table into one, by cloning the table,
// dropping the original and renaming the clone back. Returns an error-message on failure, or null on success.
private static String mergeParquetFiles(String tableName, Connection con)
{
String errorMsg;
// Guard: without a tableName we cannot know which table's files to merge.
if ( tableName == null ) {
errorMsg = "No tableName was given. Do not know the tableName for which we should merger the underlying files for!";
logger.error(errorMsg);
return errorMsg;
}
Statement statement;
try {
statement = con.createStatement();
} catch (SQLException sqle) {
errorMsg = "Problem when creating a connection-statement!\n";
logger.error(errorMsg + sqle.getMessage());
return errorMsg;
}
try {
// Clone into a single-parquet-file table, drop the original, rename the clone, then refresh table-stats.
statement.execute("CREATE TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + ImpalaConnector.databaseName + "." + tableName);
statement.execute("DROP TABLE " + ImpalaConnector.databaseName + "." + tableName + " PURGE");
statement.execute("ALTER TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp RENAME TO " + ImpalaConnector.databaseName + "." + tableName);
statement.execute("COMPUTE STATS " + ImpalaConnector.databaseName + "." + tableName);
} catch (SQLException sqle) {
errorMsg = "Problem when executing the \"clone-drop-rename\" queries!\n";
// Shorten the (potentially huge) batch-message before logging.
logger.error(errorMsg + getCutBatchExceptionMessage(sqle.getMessage()));
sqle.printStackTrace();
return errorMsg;
} finally {
// Make sure we close the statement.
try { statement.close(); }
catch (SQLException sqle3) { logger.error("Could not close the statement for executing queries in the Impala-database.\n" + sqle3); }
}
return null; // No errorMsg, everything is fine.
}
// The "batchExecute" does not work in this Impala-Database, so this is a "giant-query" solution.
// Note: this causes an "Out of memory"-ERROR in the current version of the Impala JDBC driver. If a later version is provided, then this code should be tested.
private static PreparedStatement constructLargeInsertQuery(Connection con, String baseInsertQuery, int dataSize, int numParamsPerRow) throws RuntimeException
@ -489,17 +444,6 @@ public class UrlController {
}
// Truncates an SQLException-message to a readable length, since it may carry the whole (huge) batch-query.
// NOTE(review): "SQLException.getMessage()" may return null for some drivers — a null-guard here would avoid an NPE; verify callers.
private static String getCutBatchExceptionMessage(String sqleMessage)
{
// The sqleMessage contains the actual message followed by the long batch. This makes the logs unreadable. So we should shorten the message before logging.
int maxEnding = 1500;
if ( sqleMessage.length() > maxEnding )
return (sqleMessage.substring(0, maxEnding) + "...");
else
return sqleMessage;
}
private boolean closePreparedStatements(PreparedStatement preparedStatement1, PreparedStatement preparedStatement2, Connection con) {
try {
if ( preparedStatement1 != null )

61
src/main/java/eu/openaire/urls_controller/util/FileUnZipper.java

@ -0,0 +1,61 @@
package eu.openaire.urls_controller.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
/**
 * Utility for safely extracting zip archives (with Zip-Slip protection).
 */
public class FileUnZipper {

    /**
     * Extracts the given zip archive into the target directory, creating any missing
     * parent directories and overwriting already-existing files.
     *
     * @param source the zip file to extract
     * @param target the directory to extract into (must exist)
     * @throws IOException on read/write failure or on a malicious ("Zip Slip") entry
     */
    public static void unzipFolder(Path source, Path target) throws IOException
    {
        try ( ZipInputStream zis = new ZipInputStream(new FileInputStream(source.toFile())) )
        {
            // Iterate over the entries in the zip and extract them one by one.
            ZipEntry zipEntry;
            while ( (zipEntry = zis.getNextEntry()) != null )
            {
                Path targetPath = zipSlipProtect(zipEntry, target);
                // Zip entry-names always use '/' as the separator (per the zip spec), regardless of OS.
                // Using "File.separator" here would be "\" on Windows and never match a directory-entry.
                if ( zipEntry.getName().endsWith("/") ) // If we have a directory.
                    Files.createDirectories(targetPath);
                else {
                    // Some zips store only the file-paths; create the missing parent-dirs, e.g. "data/folder/file.txt".
                    Path parentDir = targetPath.getParent();
                    if ( (parentDir != null) && Files.notExists(parentDir) )
                        Files.createDirectories(parentDir);
                    Files.copy(zis, targetPath, StandardCopyOption.REPLACE_EXISTING);
                }
            }
            zis.closeEntry();
        }
    }

    /**
     * Protects from a "Zip Slip" attack: https://snyk.io/research/zip-slip-vulnerability
     * Resolves the entry against the target directory and verifies the normalized result
     * is still inside that directory.
     *
     * @throws IOException if the entry would escape the target directory
     */
    public static Path zipSlipProtect(ZipEntry zipEntry, Path targetDir) throws IOException
    {
        Path targetDirResolved = targetDir.resolve(zipEntry.getName());
        // Make sure the normalized path still has "targetDir" as its prefix, else throw an exception.
        Path normalizePath = targetDirResolved.normalize();
        if ( !normalizePath.startsWith(targetDir) ) {
            throw new IOException("Bad zip entry: " + zipEntry.getName());
        }
        return normalizePath;
    }
}

408
src/main/java/eu/openaire/urls_controller/util/FileUtils.java

@ -1,18 +1,35 @@
package eu.openaire.urls_controller.util;
import com.google.common.collect.HashMultimap;
import eu.openaire.urls_controller.configuration.ImpalaConnector;
import eu.openaire.urls_controller.models.Payload;
import eu.openaire.urls_controller.models.Task;
import eu.openaire.urls_controller.models.UrlReport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.configurationprocessor.json.JSONException;
import org.springframework.boot.configurationprocessor.json.JSONObject;
import javax.servlet.http.HttpServletRequest;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Scanner;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class FileUtils {
@ -46,6 +63,394 @@ public class FileUtils {
}
/**
* In each insertion, a new parquet-file is created, so we end up with millions of files. Parquet is great for fast-select, so have to stick with it and merge those files..
* This method, creates a clone of the original table in order to have only one parquet file in the end. Drops the original table.
* Renames the clone to the original's name.
* Returns the errorMsg, if an error appears, otherwise it returns "null".
* */
/**
 * Merges the many small parquet files of the given table into one.
 * It clones the original table (which compacts the underlying files), drops the original,
 * renames the clone back to the original name and recomputes the table-stats.
 *
 * @param tableName the table whose underlying parquet files should be merged
 * @param con an open connection to the Impala database
 * @return an error-message on failure, or {@code null} on success
 */
public static String mergeParquetFiles(String tableName, Connection con)
{
    String errorMsg;
    if ( tableName == null ) {
        errorMsg = "No tableName was given. Do not know the tableName for which we should merge the underlying files!";
        logger.error(errorMsg);
        return errorMsg;
    }

    Statement statement;
    try {
        statement = con.createStatement();
    } catch (SQLException sqle) {
        errorMsg = "Problem when creating a connection-statement!\n";
        logger.error(errorMsg + sqle.getMessage(), sqle);
        return errorMsg;
    }

    try {
        // Clone into a single-parquet-file table, drop the original (PURGE skips the trash), rename the clone, refresh stats.
        statement.execute("CREATE TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + ImpalaConnector.databaseName + "." + tableName);
        statement.execute("DROP TABLE " + ImpalaConnector.databaseName + "." + tableName + " PURGE");
        statement.execute("ALTER TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp RENAME TO " + ImpalaConnector.databaseName + "." + tableName);
        statement.execute("COMPUTE STATS " + ImpalaConnector.databaseName + "." + tableName);
    } catch (SQLException sqle) {
        errorMsg = "Problem when executing the \"clone-drop-rename\" queries!\n";
        // Shorten the (potentially huge) batch-message before logging; pass the throwable for the stacktrace.
        logger.error(errorMsg + getCutBatchExceptionMessage(sqle.getMessage()), sqle);
        return errorMsg;
    } finally {
        // Make sure we close the statement.
        try { statement.close(); }
        catch (SQLException sqle3) { logger.error("Could not close the statement for executing queries in the Impala-database.\n" + sqle3); }
    }

    return null; // No errorMsg, everything is fine.
}
// Matches a bare filename and captures the ID-part before the extension, e.g. "doc_12:ab.pdf" -> "doc_12:ab".
private static final Pattern FILENAME_ID = Pattern.compile("([\\w_:]+)\\.[\\w]{2,10}$");
// Matches a full file-path and captures the trailing "fileName.extension" after the last '/'.
private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:]+\\.[\\w]{2,10})$");
// Base local directory under which the zipped full-texts (received from the Workers) are temporarily extracted.
public static final String baseTargetLocation = System.getProperty("user.dir") + File.separator + "fullTexts" + File.separator;
private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
/**
 * Requests the downloaded full-text files of the given assignments-batch from the Worker (in multiple
 * zip-batches), extracts them locally, uploads each file to the S3-ObjectStore and updates each
 * payload's "location" to the new S3-url. Local files (zips included) are deleted afterwards.
 * Payloads whose file could not be retrieved/uploaded get their file-related fields cleared.
 *
 * @param urlReports the reports whose payload-locations will be updated in-place
 * @param request used to discover the Worker's address (X-FORWARDED-FOR or remote address)
 * @param assignmentsBatchCounter the id of the assignments-batch being processed
 * @param workerId the id of the Worker which holds the files
 * @return true if at least one zip-batch was handled (or there were no files at all); false if all batches failed
 */
public static boolean getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, AtomicLong assignmentsBatchCounter, String workerId)
{
// The Controller has to request the files from the Worker, in order to upload them to the S3.
// We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database.
if ( request == null ) {
logger.error("The \"HttpServletRequest\" is null!");
return false;
}
// NOTE(review): "X-FORWARDED-FOR" may carry a comma-separated list of addresses when multiple proxies are involved — verify against the deployment.
String remoteAddr = request.getHeader("X-FORWARDED-FOR");
if ( remoteAddr == null || "".equals(remoteAddr) )
remoteAddr = request.getRemoteAddr();
// Get the file-locations.
List<String> allFileNames = new ArrayList<>(urlReports.size()/2);
for ( UrlReport urlReport : urlReports ) {
UrlReport.StatusType statusType = urlReport.getStatus();
if ( (statusType == null) || statusType.equals(UrlReport.StatusType.non_accessible) ) {
continue;
}
Payload payload = urlReport.getPayload();
if ( payload != null ) {
String fileLocation = payload.getLocation();
if ( fileLocation != null ) { // If the docFile was downloaded (without an error)..
Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation);
if ( !matcher.matches() ) {
continue;
}
String fileNameWithExtension = matcher.group(1);
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
continue;
}
allFileNames.add(fileNameWithExtension);
}
}
}
int numAllFullTexts = allFileNames.size();
if ( numAllFullTexts == 0 ) {
logger.warn("The file retrieved by the Worker where < 0 > for assignments_" + assignmentsBatchCounter);
return true; // It was handled, no error.
}
// Request the full-texts in batches, compressed in zip.
int numOfBatches = (numAllFullTexts / numOfFullTextsPerBatch);
if ( (numAllFullTexts % numOfFullTextsPerBatch) > 0 ) // Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
numOfBatches ++;
logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numAllFullTexts + " fullTexts. Going to request them from the Worker, in " + numOfBatches + " batches.");
// Check if one full text is left out because of the division. Put it into the last batch.
// NOTE(review): the Worker's port "1881" is hard-coded here — consider making it configurable.
String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTexts/" + assignmentsBatchCounter + "/" + numOfBatches + "/";
// Index all payloads by their ID, to be more efficiently searched later, when matching extracted files back to payloads.
HashMap<String, Payload> payloadsHashMap = new HashMap<>(urlReports.size());
for ( UrlReport urlReport : urlReports ) {
Payload payload = urlReport.getPayload();
if ( payload != null )
payloadsHashMap.put(payload.getId(), payload);
}
String curAssignmentsBaseLocation = baseTargetLocation + "assignments_" + assignmentsBatchCounter + File.separator;
File curAssignmentsBaseDir = new File(curAssignmentsBaseLocation);
int failedBatches = 0;
for ( int i=1; i <= numOfBatches; ++i )
{
List<String> fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, i, numOfBatches);
HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, i, fileNamesForCurBatch, numOfBatches, workerId);
if ( conn == null ) {
// The request failed: mark every file of this batch as not-retrieved.
updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
failedBatches ++;
continue; // To the next batch.
}
String targetLocation = curAssignmentsBaseLocation + "batch_" + i + File.separator;
File curBatchDir = new File(targetLocation);
try {
// Create the target-directory for the extracted files.
Path targetPath = Files.createDirectories(Paths.get(targetLocation));
// Save and unzip the received file. Then iterate over the PDFs, upload each one of them and get the S3-Url.
String zipFileFullPath = targetLocation + "fullTexts_" + assignmentsBatchCounter + "_" + i + ".zip";
File zipFile = new File(zipFileFullPath);
if ( ! saveZipFile(conn, zipFile) ) {
updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
deleteDirectory(curBatchDir);
failedBatches ++;
continue; // To the next batch.
}
//logger.debug("The zip file has been saved: " + zipFileFullPath); // DEBUG!
FileUnZipper.unzipFolder(Paths.get(zipFileFullPath), targetPath);
String[] fileNames = curBatchDir.list();
if ( (fileNames == null) || (fileNames.length == 0) ) {
logger.error("No filenames where extracted from directory: " + targetLocation);
updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
deleteDirectory(curBatchDir);
failedBatches ++;
continue; // To the next batch.
}
// Iterate over the files and upload them to S3.
int numUploadedFiles = 0;
for ( String fileName : fileNames )
{
String fileFullPath = targetLocation + fileName;
if ( fileFullPath.equals(zipFileFullPath) ) { // Exclude the zip-file from uploading.
continue;
}
// Get the ID of the file, in order to match it with its payload.
Matcher matcher = FILENAME_ID.matcher(fileName);
if ( !matcher.matches() ) {
continue;
}
String id = matcher.group(1);
if ( (id == null) || id.isEmpty() ) {
continue;
}
Payload payload = payloadsHashMap.get(id);
if ( payload == null ) {
continue;
}
String location = payload.getLocation();
if ( location == null ) {
continue;
}
if ( ! location.endsWith(fileName) ) { // That should NEVER happen...
logger.error("The location \"" + location + "\" of the payload matched with the ID \"" + id + "\" is not ending with the filename it was supposed to \"" + fileName + "\"");
continue;
}
String s3Url = S3ObjectStoreMinIO.uploadToS3(fileName, fileFullPath);
if ( s3Url != null ) {
payload.setLocation(s3Url); // Update the file-location to the new S3-url.
numUploadedFiles ++;
} else
setUnretrievedFullText(payload);
}
logger.info("Finished uploading " + numUploadedFiles + " full-texts of assignments_" + assignmentsBatchCounter + " on S3-ObjectStore.");
} catch (Exception e) {
logger.error("Could not extract and upload the full-texts for batch_" + i + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
failedBatches ++;
} finally {
deleteDirectory(curBatchDir); // Delete the files of this batch (including the zip-file).
}
} // End of batches.
// Delete this assignments-num directory.
deleteDirectory(curAssignmentsBaseDir);
// Check if none of the batches were handled..
if ( failedBatches == numOfBatches ) {
logger.error("None of the " + numOfBatches + " batches could be handled!");
return false;
} else {
// Some (or all) batches succeeded; clear the file-fields of any payload still pointing to a local (non-S3) file.
replaceNotUploadedFileLocations(urlReports);
return true;
}
}
/**
 * Opens a GET-connection to the Worker for the zip-file of the given batch.
 *
 * @return the connected {@link HttpURLConnection} on HTTP-200, or {@code null} on any failure
 *         (in which case the connection is disconnected to release its socket)
 */
private static HttpURLConnection getConnection(String baseUrl, AtomicLong assignmentsBatchCounter, int batchNum, List<String> fileNamesForCurBatch, int totalBatches, String workerId)
{
    String requestUrl = getRequestUrlForBatch(baseUrl, batchNum, fileNamesForCurBatch);
    logger.info("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]");
    HttpURLConnection conn = null;
    try {
        conn = (HttpURLConnection) new URL(requestUrl).openConnection();
        conn.setRequestMethod("GET");
        conn.setRequestProperty("User-Agent", "UrlsController");
        conn.connect();
        int statusCode = conn.getResponseCode();
        if ( statusCode != 200 ) {
            logger.warn("HTTP-" + statusCode + ": Problem when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl);
            conn.disconnect(); // Release the underlying socket, since this connection will not be used.
            return null;
        }
    } catch (Exception e) {
        logger.warn("Problem when requesting the ZipFile of batch_" + batchNum + " of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl + "\n" + e.getMessage());
        if ( conn != null )
            conn.disconnect(); // Release the underlying socket on failure as well.
        return null;
    }
    return conn;
}
/**
 * Returns the slice of fileNames belonging to the given (1-based) batch.
 * The last batch may be smaller, when "numAllFullTexts" is not a multiple of the batch-size.
 *
 * @param allFileNames all the full-text fileNames of this assignments-batch
 * @param numAllFullTexts the total number of fileNames (== allFileNames.size())
 * @param curBatch the 1-based index of the requested batch
 * @param numOfBatches the total number of batches (kept for interface-compatibility; not used here)
 */
private static List<String> getFileNamesForBatch(List<String> allFileNames, int numAllFullTexts, int curBatch, int numOfBatches)
{
    int initialIndex = ((curBatch-1) * numOfFullTextsPerBatch);
    int endingIndex = (curBatch * numOfFullTextsPerBatch);
    if ( endingIndex > numAllFullTexts ) // This might be the case, when the "numAllFullTexts" is too small.
        endingIndex = numAllFullTexts;

    // The indices are guaranteed in-range by the batch-arithmetic above, so no defensive
    // IndexOutOfBounds-handling is needed. Return an independent copy of the slice.
    return new ArrayList<>(allFileNames.subList(initialIndex, endingIndex));
}
/**
 * Builds the request-url for one zip-batch: {@code baseUrl + curBatch + "/" + "name1,name2,..."}.
 * Uses a local join instead of a shared static StringBuilder, so it is inherently thread-safe
 * (no thread-local needed if we ever move to multi-threaded batch requests).
 */
private static String getRequestUrlForBatch(String baseUrl, int curBatch, List<String> fileNamesForCurBatch)
{
    return baseUrl + curBatch + "/" + String.join(",", fileNamesForCurBatch);
}
/**
 * Marks every payload in the given UrlReports as having no retrieved full-text file.
 * This is useful when the whole upload-process to the S3-ObjectStore fails: we must not keep
 * "location"-links to locally stored files, since those files are going to be deleted.
 * @param urlReports the reports whose payloads are cleared in-place
 */
public static void updateUrlReportsToHaveNoFullTextFiles(List<UrlReport> urlReports)
{
    urlReports.forEach(urlReport -> {
        Payload reportPayload = urlReport.getPayload();
        if ( reportPayload != null )
            setUnretrievedFullText(reportPayload);
    });
}
/**
 * Clears the file-fields of every payload whose "location" does not point to the S3-endpoint,
 * i.e. of every file which was never uploaded and whose local copy will be deleted.
 */
private static void replaceNotUploadedFileLocations(List<UrlReport> urlReports)
{
    for ( UrlReport report : urlReports ) {
        Payload curPayload = report.getPayload();
        if ( curPayload == null )
            continue;
        String curLocation = curPayload.getLocation();
        if ( (curLocation != null) && !curLocation.startsWith(S3ObjectStoreMinIO.endpoint) )
            setUnretrievedFullText(curPayload);
    }
}
/**
 * For each fileName of a failed batch, extracts the file-ID and clears the file-fields
 * of the matching payload (the payload objects live inside the original UrlReport-list).
 */
public static void updateUrlReportsForCurBatchTOHaveNoFullTextFiles(HashMap<String, Payload> payloadsHashMap, List<String> fileNames)
{
    for ( String curFileName : fileNames ) {
        // Extract the ID-part of the fileName.
        Matcher idMatcher = FILENAME_ID.matcher(curFileName);
        if ( !idMatcher.matches() )
            continue;
        String fileId = idMatcher.group(1);
        if ( (fileId == null) || fileId.isEmpty() )
            continue;
        Payload curPayload = payloadsHashMap.get(fileId);
        if ( curPayload != null )
            setUnretrievedFullText(curPayload); // It changes the payload in the original UrlReport list.
    }
}
/**
 * Clears every file-related field of the payload, marking its full-text as not-retrieved.
 * The file is (or will be) deleted from local storage; the retrieved docUrl-link itself is kept.
 */
public static void setUnretrievedFullText(Payload payload)
{
    payload.setSize(null);
    payload.setMime_type(null);
    payload.setHash(null);
    payload.setLocation(null);
}
/**
 * Streams the response-body of the given connection into the given zip-file,
 * overwriting any pre-existing file.
 *
 * @param conn an already-connected HTTP-connection whose body is the zip-file
 * @param zipFile the local file to write into
 * @return true on success, false on any failure (which is logged)
 */
public static boolean saveZipFile(HttpURLConnection conn, File zipFile)
{
    // try-with-resources guarantees the stream is closed; Files.copy handles the buffering internally.
    try ( InputStream inStream = conn.getInputStream() ) {
        Files.copy(inStream, zipFile.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        return true;
    } catch (Exception e) {
        logger.error("Could not save the zip file \"" + zipFile.getName() + "\": " + e.getMessage(), e);
        return false;
    }
}
/**
 * Recursively deletes the given directory, reporting success instead of throwing.
 * @return true if the directory was deleted, false otherwise (the failure is logged)
 */
public static boolean deleteDirectory(File curBatchDir) {
    try {
        org.apache.commons.io.FileUtils.deleteDirectory(curBatchDir);
        return true;
    } catch (IOException ioe) {
        logger.error("The following directory could not be deleted: " + curBatchDir.getName(), ioe);
        return false;
    }
}
/**
 * Truncates an SQLException-message to a readable length.
 * The sqleMessage contains the actual message followed by the long batch-query, which makes
 * the logs unreadable, so we shorten it before logging.
 *
 * @param sqleMessage the message of the SQLException; may be null (some drivers return null messages)
 * @return the (possibly truncated) message, or null when the input is null
 */
private static String getCutBatchExceptionMessage(String sqleMessage)
{
    if ( sqleMessage == null ) // "SQLException.getMessage()" may return null; avoid an NPE below.
        return null;
    int maxEnding = 1500;
    if ( sqleMessage.length() > maxEnding )
        return (sqleMessage.substring(0, maxEnding) + "...");
    else
        return sqleMessage;
}
// This is currently not used, but it may be useful in a future scenario.
private static long getInputFileLinesNum()
{
@ -85,7 +490,6 @@ public class FileUtils {
logger.warn("The url was not found for id: \"" + idStr + "\"");
return null;
}
return new Task(idStr, urlStr, null);
}
@ -166,6 +570,4 @@ public class FileUtils {
return false;
}
}

226
src/main/java/eu/openaire/urls_controller/util/S3ObjectStoreMinIO.java

@ -0,0 +1,226 @@
package eu.openaire.urls_controller.util;
import io.minio.*;
import io.minio.messages.Bucket;
import io.minio.messages.Item;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Thin wrapper around the MinIO S3-client.
 * Reads the credentials from a local file, verifies/creates the bucket and offers
 * upload / delete / empty-bucket operations. The constructor must run before any other method.
 */
public class S3ObjectStoreMinIO {

    private static final Logger logger = LoggerFactory.getLogger(S3ObjectStoreMinIO.class);

    public static String endpoint = null; // This is useful to be "public", to test file-locations.
    private static String accessKey = null;
    private static String secretKey = null;
    private static String region = null;
    private static String bucketName = null;

    private static MinioClient minioClient;

    public static final boolean shouldEmptyBucket = false; // Set true only for testing!
    public static final String credentialsFilePath = System.getProperty("user.dir") + File.separator + "S3_minIO_credentials.txt";
    private static final boolean shouldShowAllS3Buckets = false;

    /**
     * Initializes the S3-ObjectStore: loads the credentials, builds the client and makes sure
     * the bucket exists. On any unrecoverable problem the JVM exits with a distinct status-code.
     * This must be called before any other methods.
     */
    public S3ObjectStoreMinIO()
    {
        // Take the credentials from the file: endpoint, accessKey, secretKey, region, bucketName — comma-separated on one line.
        Scanner myReader = null;
        try {
            File credentialsFile = new File(credentialsFilePath);
            if ( !credentialsFile.exists() ) {
                throw new RuntimeException("credentialsFile \"" + credentialsFilePath + "\" does not exist!");
            }
            myReader = new Scanner(credentialsFile);
            if ( myReader.hasNextLine() ) {
                String[] credentials = myReader.nextLine().split(",");
                if ( credentials.length < 5 ) {
                    throw new RuntimeException("Not all credentials were retrieved from file \"" + credentialsFilePath + "\"!");
                }
                endpoint = credentials[0].trim();
                accessKey = credentials[1].trim();
                secretKey = credentials[2].trim();
                region = credentials[3].trim();
                bucketName = credentials[4].trim();
            }
        } catch (Exception e) {
            String errorMsg = "An error prevented the retrieval of the minIO credentials from the file: " + credentialsFilePath + "\n" + e.getMessage();
            logger.error(errorMsg, e);
            System.err.println(errorMsg);
            System.exit(53);
        } finally {
            if ( myReader != null )
                myReader.close();
        }

        if ( (endpoint == null) || (accessKey == null) || (secretKey == null) || (region == null) || (bucketName == null) ) {
            String errorMsg = "No \"endpoint\" or/and \"accessKey\" or/and \"secretKey\" or/and \"region\" or/and \"bucketName\" could be retrieved from the file: " + credentialsFilePath;
            logger.error(errorMsg);
            System.err.println(errorMsg);
            System.exit(54);
        }
        // It's not safe, nor helpful to show the credentials in the logs.

        minioClient = MinioClient.builder().endpoint(endpoint).credentials(accessKey, secretKey).region(region).build();

        boolean bucketExists = false;
        try {
            bucketExists = minioClient.bucketExists(BucketExistsArgs.builder().bucket(bucketName).build());
        } catch (Exception e) {
            String errorMsg = "There was a problem while checking if the bucket \"" + bucketName + "\" exists!\n" + e.getMessage();
            logger.error(errorMsg, e); // Log the throwable too, to keep the stacktrace.
            System.err.println(errorMsg);
            System.exit(55);
        }

        // Keep this commented-out to avoid objects-deletion by accident. The code is open-sourced, so it's easy to enable this ability if we really want it (e.g. for testing).
        /* if ( bucketExists && shouldEmptyBucket ) {
            emptyBucket(bucketName, false);
            //throw new RuntimeException("stop just for test!");
        }*/

        // Make the bucket, if it does not exist.
        try {
            if ( !bucketExists ) {
                logger.info("Bucket \"" + bucketName + "\" does not exist! Going to create it..");
                minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build());
            }
            else
                logger.warn("Bucket \"" + bucketName + "\" already exists.");
        } catch (Exception e) {
            String errorMsg = "Could not create the bucket \"" + bucketName + "\"!";
            logger.error(errorMsg, e);
            System.err.println(errorMsg);
            System.exit(56);
        }

        if ( shouldShowAllS3Buckets ) {
            try {
                List<Bucket> buckets = minioClient.listBuckets();
                logger.debug("The buckets in the S3 ObjectStore are:");
                for ( Bucket bucket : buckets ) {
                    logger.debug(bucket.name());
                }
            } catch (Exception e) {
                logger.warn("Could not listBuckets: " + e.getMessage());
            }
        }
    }

    // Captures the file-extension INCLUDING the leading dot, e.g. ".pdf".
    public static final Pattern EXTENSION_PATTERN = Pattern.compile("(\\.[^.]+)$");

    /**
     * Uploads the given local file to the S3-ObjectStore.
     * NOTE: if "fileObjKeyName" already exists, the object gets overwritten
     * (unless we add versioning, which is irrelevant for different objects..).
     *
     * @param fileObjKeyName the object-key (name) under which the file is stored
     * @param fileFullPath the local path of the file to upload
     * @return the public S3-url of the uploaded object, or null on failure
     */
    public static String uploadToS3(String fileObjKeyName, String fileFullPath)
    {
        String contentType = "application/pdf"; // The default (and currently only) mimeType.
        Matcher extensionMatcher = EXTENSION_PATTERN.matcher(fileFullPath);
        if ( extensionMatcher.find() ) {
            // The captured group includes the leading dot (e.g. ".pdf"), so compare against ".pdf", not "pdf".
            String extension = extensionMatcher.group(1);
            if ( ".pdf".equalsIgnoreCase(extension) )
                contentType = "application/pdf";
            /*else if ( *//* TODO - map other extensions to their mimeTypes *//* )
                contentType = ...; */
            else
                contentType = "application/pdf"; // Fall back to pdf, until other types are handled.
        } else {
            logger.warn("The file with key \"" + fileObjKeyName + "\" does not have a file-extension! Setting the \"pdf\"-mimeType.");
        }

        try {
            minioClient.uploadObject(UploadObjectArgs.builder()
                    .bucket(bucketName)
                    .object(fileObjKeyName).filename(fileFullPath)
                    .contentType(contentType).build());
        } catch (Exception e) {
            logger.error("Could not upload the file \"" + fileObjKeyName + "\" to the S3 ObjectStore, exception: " + e.getMessage(), e);
            return null;
        }

        String s3Url = endpoint + "/" + bucketName + "/" + fileObjKeyName; // Be aware: This url works only if the access to the bucket is public.
        //logger.debug("Uploaded file \"" + fileObjKeyName + "\". The s3Url is: " + s3Url);
        return s3Url;
    }

    /**
     * Deletes all objects of the given bucket and, optionally, the (then empty) bucket itself.
     *
     * @param bucketName the bucket to empty
     * @param shouldDeleteBucket whether to also remove the empty bucket
     * @return true on full success, false if any object (or the bucket) could not be removed
     */
    public static boolean emptyBucket(String bucketName, boolean shouldDeleteBucket)
    {
        logger.warn("Going to " + (shouldDeleteBucket ? "delete" : "empty") + " bucket \"" + bucketName + "\"");

        // First list the objects of the bucket.
        Iterable<Result<Item>> results;
        try {
            results = minioClient.listObjects(ListObjectsArgs.builder().bucket(bucketName).build());
        } catch (Exception e) {
            logger.error("Could not retrieve the list of objects of bucket \"" + bucketName + "\"!");
            return false;
        }

        // Then, delete the objects.
        for ( Result<Item> resultItem : results ) {
            try {
                if ( !deleteFile(resultItem.get().objectName(), bucketName) ) {
                    logger.error("Cannot proceed with bucket deletion, since only an empty bucket can be removed!");
                    return false;
                }
            } catch (Exception e) {
                logger.error("Error getting the object from resultItem: " + resultItem.toString() + "\nThe bucket \"" + bucketName + "\" will not be able to be deleted! Exception message: " + e.getMessage());
                return false;
            }
        }

        if ( shouldDeleteBucket ) {
            // Lastly, delete the empty bucket.
            try {
                minioClient.removeBucket(RemoveBucketArgs.builder().bucket(bucketName).build());
            } catch (Exception e) {
                logger.error("Could not delete the bucket \"" + bucketName + "\" from the S3 ObjectStore, exception: " + e.getMessage(), e);
                return false;
            }
        }
        return true;
    }

    /**
     * Deletes a single object from the given bucket.
     * @return true on success, false on failure (which is logged)
     */
    public static boolean deleteFile(String fileObjKeyName, String bucketName)
    {
        try {
            minioClient.removeObject(RemoveObjectArgs.builder().bucket(bucketName).object(fileObjKeyName).build());
        } catch (Exception e) {
            logger.error("Could not delete the file \"" + fileObjKeyName + "\" from the S3 ObjectStore, exception: " + e.getMessage(), e);
            return false;
        }
        return true;
    }
}
Loading…
Cancel
Save