- Implement the "getAndUploadFullTexts" functionality. In order to access the S3-ObjectStore from one trusted place, the Controller will request the files from the workers and upload them on S3. Afterwards, the workers will delete those files from their local storage. Previously, each worker uploaded its own files.
- Move the "mergeParquetFiles" and "getCutBatchExceptionMessage" methods inside the "FileUtils" class. - Code cleanup.springify_project
parent
780ed15ce2
commit
48eed20dd8
@ -0,0 +1,61 @@
|
|||||||
|
package eu.openaire.urls_controller.util;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardCopyOption;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipInputStream;
|
||||||
|
|
||||||
|
|
||||||
|
public class FileUnZipper {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(FileUnZipper.class);
|
||||||
|
|
||||||
|
|
||||||
|
public static void unzipFolder(Path source, Path target) throws Exception
|
||||||
|
{
|
||||||
|
try (ZipInputStream zis = new ZipInputStream(new FileInputStream(source.toFile())))
|
||||||
|
{
|
||||||
|
// Iterate over the files in zip and un-zip them.
|
||||||
|
ZipEntry zipEntry = zis.getNextEntry();
|
||||||
|
while ( zipEntry != null )
|
||||||
|
{
|
||||||
|
Path targetPath = zipSlipProtect(zipEntry, target);
|
||||||
|
|
||||||
|
if ( zipEntry.getName().endsWith(File.separator) ) // If we have a directory.
|
||||||
|
Files.createDirectories(targetPath);
|
||||||
|
else {
|
||||||
|
// Some zip stored file path only, need create parent directories, e.g data/folder/file.txt
|
||||||
|
if ( targetPath.getParent() != null ) {
|
||||||
|
if ( Files.notExists(targetPath.getParent()) ) {
|
||||||
|
Files.createDirectories(targetPath.getParent());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Files.copy(zis, targetPath, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
}
|
||||||
|
zipEntry = zis.getNextEntry();
|
||||||
|
}
|
||||||
|
zis.closeEntry();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Protect from a Zip Slip attack: https://snyk.io/research/zip-slip-vulnerability
|
||||||
|
public static Path zipSlipProtect(ZipEntry zipEntry, Path targetDir) throws IOException
|
||||||
|
{
|
||||||
|
Path targetDirResolved = targetDir.resolve(zipEntry.getName());
|
||||||
|
// Make sure normalized file still has targetDir as its prefix, else throw an exception.
|
||||||
|
Path normalizePath = targetDirResolved.normalize();
|
||||||
|
if ( !normalizePath.startsWith(targetDir) ) {
|
||||||
|
throw new IOException("Bad zip entry: " + zipEntry.getName());
|
||||||
|
}
|
||||||
|
return normalizePath;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,226 @@
|
|||||||
|
package eu.openaire.urls_controller.util;
|
||||||
|
|
||||||
|
import io.minio.*;
|
||||||
|
import io.minio.messages.Bucket;
|
||||||
|
import io.minio.messages.Item;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Scanner;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
|
||||||
|
public class S3ObjectStoreMinIO {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(S3ObjectStoreMinIO.class);
|
||||||
|
|
||||||
|
public static String endpoint = null; // This is useful to be "public", to test file-locations.
|
||||||
|
private static String accessKey = null;
|
||||||
|
private static String secretKey = null;
|
||||||
|
private static String region = null;
|
||||||
|
private static String bucketName = null;
|
||||||
|
|
||||||
|
private static MinioClient minioClient;
|
||||||
|
|
||||||
|
public static final boolean shouldEmptyBucket = false; // Set true only for testing!
|
||||||
|
public static final String credentialsFilePath = System.getProperty("user.dir") + File.separator + "S3_minIO_credentials.txt";
|
||||||
|
private static final boolean shouldShowAllS3Buckets = false;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This must be called before any other methods.
|
||||||
|
* */
|
||||||
|
public S3ObjectStoreMinIO()
|
||||||
|
{
|
||||||
|
// Take the credentials from the file.
|
||||||
|
Scanner myReader = null;
|
||||||
|
try {
|
||||||
|
File credentialsFile = new File(credentialsFilePath);
|
||||||
|
if ( !credentialsFile.exists() ) {
|
||||||
|
throw new RuntimeException("credentialsFile \"" + credentialsFilePath + "\" does not exists!");
|
||||||
|
}
|
||||||
|
myReader = new Scanner(credentialsFile);
|
||||||
|
if ( myReader.hasNextLine() ) {
|
||||||
|
String[] credentials = myReader.nextLine().split(",");
|
||||||
|
if ( credentials.length < 5 ) {
|
||||||
|
throw new RuntimeException("Not all credentials were retrieved from file \"" + credentialsFilePath + "\"!");
|
||||||
|
}
|
||||||
|
endpoint = credentials[0].trim();
|
||||||
|
accessKey = credentials[1].trim();
|
||||||
|
secretKey = credentials[2].trim();
|
||||||
|
region = credentials[3].trim();
|
||||||
|
bucketName = credentials[4].trim();
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
String errorMsg = "An error prevented the retrieval of the minIO credentials from the file: " + credentialsFilePath + "\n" + e.getMessage();
|
||||||
|
logger.error(errorMsg, e);
|
||||||
|
System.err.println(errorMsg);
|
||||||
|
System.exit(53);
|
||||||
|
} finally {
|
||||||
|
if ( myReader != null )
|
||||||
|
myReader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( (endpoint == null) || (accessKey == null) || (secretKey == null) || (region == null) || (bucketName == null) ) {
|
||||||
|
String errorMsg = "No \"endpoint\" or/and \"accessKey\" or/and \"secretKey\" or/and \"region\" or/and \"bucketName\" could be retrieved from the file: " + credentialsFilePath;
|
||||||
|
logger.error(errorMsg);
|
||||||
|
System.err.println(errorMsg);
|
||||||
|
System.exit(54);
|
||||||
|
}
|
||||||
|
// It's not safe, nor helpful to show the credentials in the logs.
|
||||||
|
|
||||||
|
minioClient = MinioClient.builder().endpoint(endpoint).credentials(accessKey, secretKey).region(region).build();
|
||||||
|
|
||||||
|
boolean bucketExists = false;
|
||||||
|
try {
|
||||||
|
bucketExists = minioClient.bucketExists(BucketExistsArgs.builder().bucket(bucketName).build());
|
||||||
|
} catch (Exception e) {
|
||||||
|
String errorMsg = "There was a problem while checking if the bucket \"" + bucketName + "\" exists!\n" + e.getMessage();
|
||||||
|
logger.error(errorMsg);
|
||||||
|
System.err.println(errorMsg);
|
||||||
|
System.exit(55);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keep this commented-out to avoid objects-deletion by accident. The code is open-sourced, so it's easy to enable this ability if we really want it (e.g. for testing).
|
||||||
|
/* if ( bucketExists && shouldEmptyBucket ) {
|
||||||
|
emptyBucket(bucketName, false);
|
||||||
|
//throw new RuntimeException("stop just for test!");
|
||||||
|
}*/
|
||||||
|
|
||||||
|
// Make the bucket, if not exist.
|
||||||
|
try {
|
||||||
|
if ( !bucketExists ) {
|
||||||
|
logger.info("Bucket \"" + bucketName + "\" does not exist! Going to create it..");
|
||||||
|
minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
logger.warn("Bucket \"" + bucketName + "\" already exists.");
|
||||||
|
} catch (Exception e) {
|
||||||
|
String errorMsg = "Could not create the bucket \"" + bucketName + "\"!";
|
||||||
|
logger.error(errorMsg ,e);
|
||||||
|
System.err.println(errorMsg);
|
||||||
|
System.exit(56);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( shouldShowAllS3Buckets ) {
|
||||||
|
List<Bucket> buckets = null;
|
||||||
|
try {
|
||||||
|
buckets = minioClient.listBuckets();
|
||||||
|
logger.debug("The buckets in the S3 ObjectStore are:");
|
||||||
|
for ( Bucket bucket : buckets ) {
|
||||||
|
logger.debug(bucket.name());
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("Could not listBuckets: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static final Pattern EXTENSION_PATTERN = Pattern.compile("(\\.[^.]+)$");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param fileObjKeyName = "**File object key name**";
|
||||||
|
* @param fileFullPath = "**Path of the file to upload**";
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static String uploadToS3(String fileObjKeyName, String fileFullPath)
|
||||||
|
{
|
||||||
|
String contentType = null;
|
||||||
|
|
||||||
|
// Take the Matcher to retrieve the extension.
|
||||||
|
Matcher extensionMatcher = EXTENSION_PATTERN.matcher(fileFullPath);
|
||||||
|
if ( extensionMatcher.find() ) {
|
||||||
|
String extension = null;
|
||||||
|
if ( (extension = extensionMatcher.group(0)) == null )
|
||||||
|
contentType = "application/pdf";
|
||||||
|
else {
|
||||||
|
if ( extension.equals("pdf") )
|
||||||
|
contentType = "application/pdf";
|
||||||
|
/*else if ( *//* TODO - other-extension-match *//* )
|
||||||
|
contentType = "application/pdf"; */
|
||||||
|
else
|
||||||
|
contentType = "application/pdf";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
logger.warn("The file with key \"" + fileObjKeyName + "\" does not have a file-extension! Setting the \"pdf\"-mimeType.");
|
||||||
|
contentType = "application/pdf";
|
||||||
|
}
|
||||||
|
|
||||||
|
ObjectWriteResponse response;
|
||||||
|
try {
|
||||||
|
response = minioClient.uploadObject(UploadObjectArgs.builder()
|
||||||
|
.bucket(bucketName)
|
||||||
|
.object(fileObjKeyName).filename(fileFullPath)
|
||||||
|
.contentType(contentType).build());
|
||||||
|
|
||||||
|
// TODO - What if the fileObjKeyName already exists?
|
||||||
|
// Right now it gets overwritten (unless we add versioning, which is irrelevant for different objects..)
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Could not upload the file \"" + fileObjKeyName + "\" to the S3 ObjectStore, exception: " + e.getMessage(), e);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
String s3Url = endpoint + "/" + bucketName + "/" + fileObjKeyName; // Be aware: This url works only if the access to the bucket is public.
|
||||||
|
//logger.debug("Uploaded file \"" + fileObjKeyName + "\". The s3Url is: " + s3Url);
|
||||||
|
return s3Url;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static boolean emptyBucket(String bucketName, boolean shouldDeleteBucket)
|
||||||
|
{
|
||||||
|
logger.warn("Going to " + (shouldDeleteBucket ? "delete" : "empty") + " bucket \"" + bucketName + "\"");
|
||||||
|
|
||||||
|
// First list the objects of the bucket.
|
||||||
|
Iterable<Result<Item>> results;
|
||||||
|
try {
|
||||||
|
results = minioClient.listObjects(ListObjectsArgs.builder().bucket(bucketName).build());
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Could not retrieve the list of objects of bucket \"" + bucketName + "\"!");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then, delete the objects.
|
||||||
|
for ( Result<Item> resultItem : results ) {
|
||||||
|
try {
|
||||||
|
if ( !deleteFile(resultItem.get().objectName(), bucketName) ) {
|
||||||
|
logger.error("Cannot proceed with bucket deletion, since only an empty bucket can be removed!");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Error getting the object from resultItem: " + resultItem.toString() + "\nThe bucket \"" + bucketName + "\" will not be able to be deleted! Exception message: " + e.getMessage());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( shouldDeleteBucket ) {
|
||||||
|
// Lastly, delete the empty bucket.
|
||||||
|
try {
|
||||||
|
minioClient.removeBucket(RemoveBucketArgs.builder().bucket(bucketName).build());
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Could not delete the bucket \"" + bucketName + "\" from the S3 ObjectStore, exception: " + e.getMessage(), e);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static boolean deleteFile(String fileObjKeyName, String bucketName)
|
||||||
|
{
|
||||||
|
try {
|
||||||
|
minioClient.removeObject(RemoveObjectArgs.builder().bucket(bucketName).object(fileObjKeyName).build());
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Could not delete the file \"" + fileObjKeyName + "\" from the S3 ObjectStore, exception: " + e.getMessage(), e);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue