New feature: BulkImport full-text files from compatible datasources.

2023-05-11 03:07:55 +03:00 · 2023-05-11 03:07:55 +03:00 · b6e8cd1889
parent 42b93e9429
commit b6e8cd1889
16 changed files with 1289 additions and 61 deletions
--- a/build.gradle
+++ b/build.gradle
@ -112,6 +112,10 @@ dependencies {
    // https://mvnrepository.com/artifact/org.json/json
    implementation 'org.json:json:20230227'

+    // https://mvnrepository.com/artifact/com.google.code.gson/gson
+    implementation 'com.google.code.gson:gson:2.10.1'
+
+
    // https://mvnrepository.com/artifact/io.micrometer/micrometer-registry-prometheus
    runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.10.6'

--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -17,6 +17,12 @@ services:
      - type: bind
        source: $HOME/logs
        target: /logs
+      - type: bind
+        source: /mnt/bulkImport
+        target: /mnt/bulkImport
+      - type: bind
+        source: $HOME/bulkImportReports
+        target: /bulkImportReports
    build:
      dockerfile: ./Dockerfile
      context: .
--- a/src/main/java/eu/openaire/urls_controller/Application.java
+++ b/src/main/java/eu/openaire/urls_controller/Application.java
@ -1,5 +1,6 @@
 package eu.openaire.urls_controller;

+import eu.openaire.urls_controller.services.FullTextsServiceImpl;
 import eu.openaire.urls_controller.services.UrlsServiceImpl;
 import eu.openaire.urls_controller.util.FileUtils;
 import eu.openaire.urls_controller.util.UriBuilder;
@ -51,6 +52,8 @@ public class Application {

        shutdownThreads(UrlsServiceImpl.insertsExecutor);
        shutdownThreads(FileUtils.hashMatchingExecutor);
+        shutdownThreads(FullTextsServiceImpl.backgroundExecutor);
+        shutdownThreads(FullTextsServiceImpl.bulkImportExecutor);

        logger.info("Exiting..");
    }
--- a/src/main/java/eu/openaire/urls_controller/components/BulkImport.java
+++ b/src/main/java/eu/openaire/urls_controller/components/BulkImport.java
@ -0,0 +1,111 @@
+package eu.openaire.urls_controller.components;
+
+
+import org.springframework.boot.context.properties.ConfigurationProperties;
+import org.springframework.stereotype.Component;
+
+import java.util.Map;
+
+@Component
+@ConfigurationProperties(prefix = "bulkimport")
+public class BulkImport {
+
+    private String baseBulkImportLocation;
+
+    private String bulkImportReportLocation;
+
+    private Map<String, BulkImportSource> bulkImportSources;
+
+    public BulkImport() {
+    }
+
+    public String getBaseBulkImportLocation() {
+        return baseBulkImportLocation;
+    }
+
+    public void setBaseBulkImportLocation(String baseBulkImportLocation) {
+        this.baseBulkImportLocation = baseBulkImportLocation;
+    }
+
+    public String getBulkImportReportLocation() {
+        return bulkImportReportLocation;
+    }
+
+    public void setBulkImportReportLocation(String bulkImportReportLocation) {
+        this.bulkImportReportLocation = bulkImportReportLocation;
+    }
+
+    public Map<String, BulkImportSource> getBulkImportSources() {
+        return bulkImportSources;
+    }
+
+    public void setBulkImportSources(Map<String, BulkImportSource> bulkImportSources) {
+        this.bulkImportSources = bulkImportSources;
+    }
+
+    @Override
+    public String toString() {
+        return "BulkImport{" +
+                "baseBulkImportLocation='" + baseBulkImportLocation + '\'' +
+                ", bulkImportReportLocation='" + bulkImportReportLocation + '\'' +
+                ", bulkImportSources=" + bulkImportSources +
+                '}';
+    }
+
+
+    public static class BulkImportSource {
+        String datasourceID;
+        String datasourcePrefix;
+        String pdfUrlPrefix;
+        String mimeType;
+
+
+        public BulkImportSource() {
+        }
+
+
+        public String getDatasourceID() {
+            return datasourceID;
+        }
+
+        public void setDatasourceID(String datasourceID) {
+            this.datasourceID = datasourceID;
+        }
+
+        public String getDatasourcePrefix() {
+            return datasourcePrefix;
+        }
+
+        public void setDatasourcePrefix(String datasourcePrefix) {
+            this.datasourcePrefix = datasourcePrefix;
+        }
+
+        public String getPdfUrlPrefix() {
+            return pdfUrlPrefix;
+        }
+
+        public void setPdfUrlPrefix(String pdfUrlPrefix) {
+            this.pdfUrlPrefix = pdfUrlPrefix;
+        }
+
+        public String getMimeType() {
+            return mimeType;
+        }
+
+        public void setMimeType(String mimeType) {
+            this.mimeType = mimeType;
+        }
+
+
+        @Override
+        public String toString() {
+            return "BulkImportSource{" +
+                    "datasourceID='" + datasourceID + '\'' +
+                    ", datasourcePrefix='" + datasourcePrefix + '\'' +
+                    ", pdfUrlPrefix='" + pdfUrlPrefix + '\'' +
+                    ", mimeType='" + mimeType + '\'' +
+                    '}';
+        }
+    }
+
+}
--- a/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java
+++ b/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java
@ -1,12 +1,18 @@
 package eu.openaire.urls_controller.components;

+import eu.openaire.urls_controller.services.FullTextsServiceImpl;
+import eu.openaire.urls_controller.util.GenericUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-//import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Component;

-import java.text.SimpleDateFormat;
-import java.util.Date;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;


@Component
@ -14,10 +20,40 @@ public class ScheduledTasks {

    private static final Logger logger = LoggerFactory.getLogger(ScheduledTasks.class);

-    private static final SimpleDateFormat dateFormat = new SimpleDateFormat("HH:mm:ss");

-    //@Scheduled(fixedRate = 600_000)   // Run every 10 mins: 600_000
-    public void reportCurrentTime() {
-        logger.info("Server is live! Time is now {}", dateFormat.format(new Date()));
+    @Scheduled(fixedDelay = 3_600_000)  // Execute this method 1 hour after the last execution, in order for some tasks to have been gathered.
+    //@Scheduled(fixedDelay = 20_000)    // Just for testing (every 20 secs).
+    public void executeBackgroundTasks()
+    {
+        List<Callable<Boolean>> tempList = new ArrayList<>(FullTextsServiceImpl.backgroundCallableTasks);   // Copy the list in order to know what was executed and delete only that data later.
+        // So the items added while this execution happens, will be left in the list, while the other will be deleted.
+        int numOfTasks = tempList.size();   // Since the temp-list is a deep-copy and not a reference, new tasks that are added will not be executed.
+        if ( numOfTasks == 0 )
+            return;
+
+        logger.debug(numOfTasks + " background tasks were found inside the \"backgroundCallableTasks\" list and are about to be executed.");
+        // Execute the tasks and wait for them to finish.
+        try {
+            List<Future<Boolean>> futures = FullTextsServiceImpl.backgroundExecutor.invokeAll(tempList);
+            int sizeOfFutures = futures.size();
+            for ( int i = 0; i < sizeOfFutures; ++i ) {
+                try {
+                    Boolean value = futures.get(i).get();	// Get and see if an exception is thrown..
+                    // Add check for the value, if wanted.. (we don't care at the moment)
+                } catch (ExecutionException ee) {
+                    String stackTraceMessage = GenericUtils.getSelectiveStackTrace(ee, null, 15);	// These can be serious errors like an "out of memory exception" (Java HEAP).
+                    logger.error("Task_" + (i+1) + " failed with: " + ee.getMessage() + "\n" + stackTraceMessage);
+                } catch (CancellationException ce) {
+                    logger.error("Task_" + (i+1) + " was cancelled: " + ce.getMessage());
+                } catch (IndexOutOfBoundsException ioobe) {
+                    logger.error("IOOBE for task_" + i + " in the futures-list! " + ioobe.getMessage());
+                } finally {
+                    FullTextsServiceImpl.backgroundCallableTasks.remove(tempList.get(i));   // Remove this object from the global list. Do not use indexes, since they will be different after each deletion and addition.
+                }
+            }
+        } catch (Exception e) {
+            logger.error("", e);
+        }
    }
+
 }
--- a/src/main/java/eu/openaire/urls_controller/controllers/FullTextsController.java
+++ b/src/main/java/eu/openaire/urls_controller/controllers/FullTextsController.java
@ -0,0 +1,204 @@
+package eu.openaire.urls_controller.controllers;
+
+import eu.openaire.urls_controller.components.BulkImport;
+import eu.openaire.urls_controller.models.BulkImportReport;
+import eu.openaire.urls_controller.services.FullTextsService;
+import eu.openaire.urls_controller.services.FullTextsServiceImpl;
+import eu.openaire.urls_controller.util.FileUtils;
+import eu.openaire.urls_controller.util.GenericUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.http.HttpStatus;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.InputStreamReader;
+import java.nio.file.*;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+@RestController
+@RequestMapping("")
+public class FullTextsController {
+
+    private static final Logger logger = LoggerFactory.getLogger(FullTextsController.class);
+
+    @Autowired
+    private FileUtils fileUtils;
+
+    private final FullTextsService fullTextsService;
+
+    private final String baseBulkImportLocation;
+
+    private final String bulkImportReportLocation;
+
+    private final HashMap<String, BulkImport.BulkImportSource> bulkImportSources;
+
+    public static final Set<String> bulkImportDirs = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
+
+
+
+    public FullTextsController(FullTextsService fullTextsService, BulkImport bulkImport)
+    {
+        String bulkImportReportLocation1;
+        this.baseBulkImportLocation = bulkImport.getBaseBulkImportLocation();
+
+        this.bulkImportSources = new HashMap<>(bulkImport.getBulkImportSources());
+
+        bulkImportReportLocation1 = bulkImport.getBulkImportReportLocation();
+        if ( !bulkImportReportLocation1.endsWith("/") )
+            bulkImportReportLocation1 += "/";
+        this.bulkImportReportLocation = bulkImportReportLocation1;
+
+        this.fullTextsService = fullTextsService;
+    }
+
+
+    private static final Pattern LAST_DIR_REGEX = Pattern.compile("^.*/([^/]+[/]?)$");
+
+    @GetMapping("bulkImportFullTexts")
+    public ResponseEntity<?> bulkImportFullTexts(@RequestParam String provenance, @RequestParam String bulkImportDir, @RequestParam boolean shouldDeleteFilesOnFinish) {
+
+        BulkImport.BulkImportSource bulkImportSource = bulkImportSources.get(provenance);
+        if ( bulkImportSource == null ) {
+            String errorMsg = "The provided provenance \"" + provenance + "\" is not in the list of the bulk-imported sources, so no configuration-rules are available!";
+            logger.error(errorMsg);
+            return ResponseEntity.badRequest().body(errorMsg);  // It's the user's fault that gave an unsupported provenance.
+        }
+
+        // Check if the given directory parameter exists.
+        if ( bulkImportDir.isEmpty() ) {
+            String errorMsg = "The \"bulkImportDir\" was missing from the request!";
+            logger.error(errorMsg);
+            return ResponseEntity.badRequest().body(errorMsg);
+        }
+
+        String givenBulkDir = bulkImportDir;    // Keep the given value here, to not expose the full-path, in case the user has not provided an absolut path.
+
+        // Make sure the whole path ends with "/", so that we can easily append file-names later.
+        if ( !bulkImportDir.endsWith("/") )
+            bulkImportDir += "/";
+
+        String relativeBulkImportDir = null;
+
+        // Check if we have "relative-path" so that we can append it to the "baseBulkImportLocation".
+        if ( !bulkImportDir.startsWith("/") ) {
+            // A relative path was given.
+            relativeBulkImportDir = bulkImportDir;
+            bulkImportDir = baseBulkImportLocation + bulkImportDir;
+        } else {
+            String errMsg = "The bulkImportDir \"" + bulkImportDir + "\" was problematic!";
+            Matcher matcher = LAST_DIR_REGEX.matcher(bulkImportDir);
+            if ( !matcher.matches() ) {
+                logger.error(errMsg);
+                return ResponseEntity.badRequest().body(errMsg);
+            }
+            relativeBulkImportDir = matcher.group(1);
+            if ( (relativeBulkImportDir == null) || relativeBulkImportDir.isEmpty() ) {
+                logger.error(errMsg);
+                return ResponseEntity.badRequest().body(errMsg);
+            }
+        }
+
+        // The "relativeBulkImportDir" should always be guaranteed to end with "/"! Otherwise, the import-procedure will fail.
+        logger.info("Received a \"bulkImportFullTexts\" request for \"" + provenance + "\" procedure and bulkImportDir: \"" + bulkImportDir + "\".");
+
+        // Check whether the given directory is accessible.
+        File givenDir = new File(bulkImportDir);
+        if ( !givenDir.isDirectory() ) {
+            String errorMsg = "The bulkImportDir \"" + bulkImportDir + "\" is invalid!";
+            logger.error(errorMsg);
+            return ResponseEntity.badRequest().body(errorMsg);
+        }
+
+        // Efficiently check if the dir is empty, without loading all the file-entries in memory.
+        try ( DirectoryStream<Path> directory = Files.newDirectoryStream(givenDir.toPath()) ) {
+            if ( !directory.iterator().hasNext() ) {
+                String errorMsg = "The givenDir \"" + givenDir + "\" is empty!";
+                logger.warn(errorMsg);
+                return ResponseEntity.badRequest().body(errorMsg);
+            }
+            // The above check does not catch the case were the directory has at least one subdirectory, but no full-texts files.
+            // The "iterator()" will have a "next" entry, but no full-text file will exist. Although, that case will be rare.
+        } catch (Exception e) {
+            String errorMsg = "Error when checking if the givenDir \"" + givenDir + "\" is empty!";
+            logger.error(errorMsg);
+            return ResponseEntity.internalServerError().body(errorMsg);
+        }
+
+        // Detect if the same directory is scheduled for being processed. In that case, return a 429.
+        if ( ! bulkImportDirs.add(bulkImportDir) ) {
+            // We allow multiple jobs for the same provenance, running at the same time, but not multiple jobs for the same bulkImportDirectory.
+            String errorMsg = "There is a bulk-import request for the directory \"" + bulkImportDir + "\" that is being handled at the moment. Please wait until it's finished being processed, before making another request.";
+            logger.error(errorMsg);
+            return ResponseEntity.status(HttpStatus.TOO_MANY_REQUESTS).body(errorMsg);
+        }
+
+        Path currentBulkImportReportLocationDir = Paths.get(this.bulkImportReportLocation, provenance);
+        try {
+            Files.createDirectories(currentBulkImportReportLocationDir);    // No-op if dir exists. It does not throw a "alreadyExistsException"
+        } catch (Exception e) {
+            String errorMsg = "Could nor create the \"bulkImportReportLocation\" for provenance \"" + provenance + "\" : " + currentBulkImportReportLocationDir;
+            logger.error(errorMsg, e);
+            return ResponseEntity.internalServerError().body(errorMsg);
+        }
+
+        String bulkImportReportID = provenance + "/" + relativeBulkImportDir.substring(0, (relativeBulkImportDir.length() -1)) + "_report_" + GenericUtils.getRandomNumber(10000, 99999);
+        String bulkImportReportFullPath = this.bulkImportReportLocation + bulkImportReportID + ".json";
+
+        String msg = "The 'bulkImportFullTexts' request for '" + provenance + "' procedure and bulkImportDir: '" + givenBulkDir + "' was accepted and will be scheduled for execution. "
+                + (shouldDeleteFilesOnFinish ? "The successfully imported files will be deleted." : "All files will remain inside the directory after processing.")
+                + " You can request a report at any moment, using this reportFileID:  " + bulkImportReportID;
+
+        BulkImportReport bulkImportReport = new BulkImportReport(provenance, bulkImportReportFullPath, bulkImportReportID);
+        bulkImportReport.addEvent(msg);
+
+        String errorMsg = fileUtils.writeToFile(bulkImportReportFullPath, bulkImportReport.getJsonReport());
+        if ( errorMsg != null )
+            return ResponseEntity.internalServerError().body(errorMsg);
+
+        logger.info(msg);
+
+        // Add this to a background job, since it will take a lot of time to be completed, and the caller will get a "read-timeout" at least and a socket-timeout at most (in case of a network failure during those hours).
+        String finalBulkImportDir = bulkImportDir;
+        String finalRelativeBulkImportDir = relativeBulkImportDir;
+        FullTextsServiceImpl.backgroundCallableTasks.add(() ->
+            fullTextsService.bulkImportFullTextsFromDirectory(bulkImportReport, finalRelativeBulkImportDir, finalBulkImportDir, givenDir, provenance, bulkImportSource, shouldDeleteFilesOnFinish)
+        );
+
+        return ResponseEntity.ok().body(msg);
+    }
+
+
+    @GetMapping("getBulkImportReport")
+    public ResponseEntity<?> getBulkImportReport(@RequestParam("id") String bulkImportReportId)
+    {
+        // Write the contents of the report-file to a string (efficiently!) and return the whole content as an HTTP-response.
+        StringBuilder stringBuilder = new StringBuilder(2_000);
+        String line;
+        try ( BufferedReader in = new BufferedReader(new InputStreamReader(Files.newInputStream(Paths.get(this.bulkImportReportLocation, bulkImportReportId + ".json"))), FileUtils.tenMb) ) {
+            while ( (line = in.readLine()) != null )
+                stringBuilder.append(line).append("\n");    // The "readLine()" does not return the line-term char.
+        } catch (NoSuchFileException nsfe) {
+            logger.warn("The requested report-file with ID: \"" + bulkImportReportId + "\" was not found!");
+            return ResponseEntity.notFound().build();
+        } catch (Exception e) {
+            String errorMsg = "Failed to read the contents of report-file with ID: " + bulkImportReportId;
+            logger.error(errorMsg, e);
+            return ResponseEntity.internalServerError().body(errorMsg); // It's ok to give the file-path to the user, since the report already contains the file-path.
+        }
+
+        return ResponseEntity.ok().body(stringBuilder.toString());
+    }
+
+}
--- a/src/main/java/eu/openaire/urls_controller/models/BulkImportReport.java
+++ b/src/main/java/eu/openaire/urls_controller/models/BulkImportReport.java
@ -0,0 +1,105 @@
+package eu.openaire.urls_controller.models;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.Multimaps;
+import com.google.common.collect.SetMultimap;
+import com.google.gson.Gson;
+import eu.openaire.urls_controller.util.GenericUtils;
+
+import java.util.Collection;
+import java.util.Map;
+
+
+/**
+ * The report of a bulk-import procedure: its identity and the events which occurred, grouped by timestamp.
+ * Events can be added from multiple threads, while "getJsonReport()" produces the JSON form of the report.
+ */
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class BulkImportReport {
+
+    // A single shared instance is enough, as the Gson documentation describes the Gson instances as thread-safe. Being static, it is not serialized.
+    private static final Gson GSON = new Gson();
+
+    @JsonProperty
+    private String provenance;
+
+    @JsonProperty
+    private String reportLocation;
+
+    @JsonProperty
+    private String reportID;
+
+    // This will not be serialized, since Gson cannot serialize Multimaps. Instead, it will be converted to the "simpler" map below.
+    private transient SetMultimap<String, String> eventsMultimap = Multimaps.synchronizedSetMultimap(LinkedHashMultimap.create());
+    // We need a "LinkedHashMultimap", so that the order of the keys (timestamps) stay ascending, so the final report makes sense in chronological order.
+    // We need for one key (timestamp) to have multiple values (events), in order to not lose events happening at the same time.
+
+    // A view of the multimap above, which is set right before the serialization.
+    @JsonProperty
+    private Map<String, Collection<String>> eventsMap;
+
+
+    public BulkImportReport(String provenance, String reportLocation, String reportID) {
+        this.provenance = provenance;
+        this.reportLocation = reportLocation;
+        this.reportID = reportID;
+    }
+
+
+    /**
+     * Adds the given event to the report, under the current timestamp.
+     *
+     * @param event the description of the event
+     */
+    public void addEvent(String event) {
+        eventsMultimap.put(GenericUtils.getReadableCurrentTimeAndZone(), event);
+    }
+
+    /**
+     * Serializes this report to JSON, using Gson.
+     *
+     * @return the JSON representation of this report
+     */
+    public String getJsonReport()
+    {
+        SetMultimap<String, String> currentEvents = this.eventsMultimap;
+        // The synchronized multimap guards only its own methods. Iterating its views (which Gson does through the "eventsMap")
+        // requires holding the lock of the multimap, otherwise a concurrent "addEvent()" can break the iteration.
+        synchronized ( currentEvents ) {
+            //Convert the LinkedHashMultiMap<String, String> to Map<String, Collection<String>>, since Gson cannot serialize Multimaps.
+            eventsMap = currentEvents.asMap();
+            return GSON.toJson(this);
+        }
+    }
+
+    public String getProvenance() {
+        return provenance;
+    }
+
+    public void setProvenance(String provenance) {
+        this.provenance = provenance;
+    }
+
+    public String getReportLocation() {
+        return reportLocation;
+    }
+
+    public void setReportLocation(String reportLocation) {
+        this.reportLocation = reportLocation;
+    }
+
+    public String getReportID() {
+        return reportID;
+    }
+
+    public void setReportID(String reportID) {
+        this.reportID = reportID;
+    }
+
+    public SetMultimap<String, String> getEventsMultimap() {
+        return eventsMultimap;
+    }
+
+    public void setEventsMultimap(SetMultimap<String, String> eventsMultimap) {
+        this.eventsMultimap = eventsMultimap;
+    }
+
+    public Map<String, Collection<String>> getEventsMap() {
+        return eventsMap;
+    }
+
+    public void setEventsMap(Map<String, Collection<String>> eventsMap) {
+        this.eventsMap = eventsMap;
+    }
+
+    @Override
+    public String toString() {
+        return "BulkImportReport{" +
+                "provenance='" + provenance + '\'' +
+                ", reportLocation='" + reportLocation + '\'' +
+                ", reportID='" + reportID + '\'' +
+                ", eventsMultimap=" + eventsMultimap +
+                ", eventsMap=" + eventsMap +
+                '}';
+    }
+
+}
--- a/src/main/java/eu/openaire/urls_controller/models/DocFileData.java
+++ b/src/main/java/eu/openaire/urls_controller/models/DocFileData.java
@ -0,0 +1,115 @@
+package eu.openaire.urls_controller.models;
+
+import com.google.common.hash.Hashing;
+import com.google.common.io.Files;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.nio.file.Paths;
+
+public class DocFileData {
+
+    private static final Logger logger = LoggerFactory.getLogger(DocFileData.class);
+
+    private File docFile;
+    private String hash;
+    private Long size;
+    private String location;
+
+    private FileOutputStream fileOutputStream;
+
+
+    public DocFileData(File docFile, String hash, Long size, String location, FileOutputStream fileOutputStream) {
+        this.docFile = docFile;
+        this.hash = hash;
+        this.size = size;
+        this.location = location;
+        this.fileOutputStream = fileOutputStream;
+    }
+
+
+    public DocFileData(File docFile, String hash, Long size, String location) {
+        this.docFile = docFile;
+        this.hash = hash;
+        this.size = size;
+        this.location = location;
+    }
+
+    public DocFileData(File docFile, FileOutputStream fileOutputStream) {
+        this.docFile = docFile;
+        this.fileOutputStream = fileOutputStream;
+    }
+
+    public File getDocFile() {
+        return docFile;
+    }
+
+    public void setDocFile(File docFile) {
+        this.docFile = docFile;
+    }
+
+    public String getHash() {
+        return hash;
+    }
+
+    public void setHash(String hash) {
+        this.hash = hash;
+    }
+
+    public Long getSize() {
+        return size;
+    }
+
+    public void setSize(Long size) {
+        this.size = size;
+    }
+
+    /**
+     * Set this as a separate method (not automatically applied in the contractor), in order to avoid long thread-blocking in the caller method, which downloads and constructs this object inside a synchronized block.
+     * */
+    public void calculateAndSetHashAndSize() {
+        if ( this.docFile == null ) {  // Verify the "docFile" is already set, otherwise we get an NPE.
+            logger.warn("The \"docFile\" was not previously set!");
+            return;
+        }
+
+        String fileLocation = this.docFile.getAbsolutePath();
+        try {
+            this.hash = Files.asByteSource(this.docFile).hash(Hashing.md5()).toString();	// These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files.
+            //logger.debug("MD5 for file \"" + docFile.getName() + "\": " + this.hash); // DEBUG!
+            this.size = java.nio.file.Files.size(Paths.get(fileLocation));
+            //logger.debug("Size of file \"" + docFile.getName() + "\": " + this.size); // DEBUG!
+        } catch (Exception e) {
+            logger.error("Could not retrieve the size " + ((this.hash == null) ? "and the MD5-hash " : "") + "of the file: " + fileLocation, e);
+        }
+    }
+
+    public String getLocation() {
+        return location;
+    }
+
+    public void setLocation(String location) {
+        this.location = location;
+    }
+
+    public FileOutputStream getFileOutputStream() {
+        return fileOutputStream;
+    }
+
+    public void setFileOutputStream(FileOutputStream fileOutputStream) {
+        this.fileOutputStream = fileOutputStream;
+    }
+
+    @Override
+    public String toString() {
+        return "DocFileData{" +
+                "docFile=" + docFile +
+                ", hash='" + hash + '\'' +
+                ", size=" + size +
+                ", location='" + location + '\'' +
+                ", fileOutputStream=" + fileOutputStream +
+                '}';
+    }
+}
--- a/src/main/java/eu/openaire/urls_controller/models/FileLocationData.java
+++ b/src/main/java/eu/openaire/urls_controller/models/FileLocationData.java
@ -0,0 +1,89 @@
+package eu.openaire.urls_controller.models;
+
+import eu.openaire.urls_controller.util.FileUtils;
+
+import java.util.regex.Matcher;
+
+/**
+ * The parts of a file-location (directory, file-name, file-name-ID and extension), as they are extracted by the "FileUtils.FILENAME_ID_EXTENSION" regex.
+ */
+public class FileLocationData {
+
+    String fileDir;
+
+    String fileName;
+
+    String filenameWithoutExtension;
+
+    String fileNameID;
+
+    String dotFileExtension;
+
+    /**
+     * Splits the given file-location into its parts.
+     *
+     * @param fileLocation the location of the file
+     * @throws RuntimeException if the location does not match the regex, or if the directory, the file-name, the file-name-ID or the extension is missing
+     */
+    public FileLocationData(String fileLocation) throws RuntimeException {
+        // Extract and set LocationData.
+        Matcher matcher = FileUtils.FILENAME_ID_EXTENSION.matcher(fileLocation);
+        if ( !matcher.matches() )
+            throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILENAME_ID_EXTENSION);
+        fileDir = extractRequiredGroup(matcher, 1, "fileDir", fileLocation);
+        fileName = extractRequiredGroup(matcher, 2, "fileName", fileLocation);
+        // Group 3 is expected to be the file-name without its extension (verify against the regex in "FileUtils").
+        // It is not validated, since nothing depends on it yet, but it is stored so that its getter does not always return null.
+        filenameWithoutExtension = matcher.group(3);
+        fileNameID = extractRequiredGroup(matcher, 4, "fileNameID", fileLocation);
+        dotFileExtension = extractRequiredGroup(matcher, 5, "dotFileExtension", fileLocation);
+    }
+
+    /**
+     * Returns the value of the given regex-group, or throws a RuntimeException when the value is null or empty.
+     */
+    private static String extractRequiredGroup(Matcher matcher, int groupIndex, String fieldName, String fileLocation) {
+        String value = matcher.group(groupIndex);
+        if ( (value == null) || value.isEmpty() )
+            throw new RuntimeException("Failed to extract the \"" + fieldName + "\" from \"" + fileLocation + "\".");
+        return value;
+    }
+
+    public String getFileDir() {
+        return fileDir;
+    }
+
+    public void setFileDir(String fileDir) {
+        this.fileDir = fileDir;
+    }
+
+    public String getFileName() {
+        return fileName;
+    }
+
+    public void setFileName(String fileName) {
+        this.fileName = fileName;
+    }
+
+    public String getFilenameWithoutExtension() {
+        return filenameWithoutExtension;
+    }
+
+    public void setFilenameWithoutExtension(String filenameWithoutExtension) {
+        this.filenameWithoutExtension = filenameWithoutExtension;
+    }
+
+    public String getFileNameID() {
+        return fileNameID;
+    }
+
+    public void setFileNameID(String fileNameID) {
+        this.fileNameID = fileNameID;
+    }
+
+    public String getDotFileExtension() {
+        return dotFileExtension;
+    }
+
+    public void setDotFileExtension(String dotFileExtension) {
+        this.dotFileExtension = dotFileExtension;
+    }
+
+    @Override
+    public String toString() {
+        return "FileLocationData{" +
+                "fileDir='" + fileDir + '\'' +
+                ", fileName='" + fileName + '\'' +
+                ", filenameWithoutExtension='" + filenameWithoutExtension + '\'' +
+                ", fileNameID='" + fileNameID + '\'' +
+                ", dotFileExtension='" + dotFileExtension + '\'' +
+                '}';
+    }
+}
--- a/src/main/java/eu/openaire/urls_controller/services/FullTextsService.java
+++ b/src/main/java/eu/openaire/urls_controller/services/FullTextsService.java
@ -0,0 +1,18 @@
+package eu.openaire.urls_controller.services;
+
+import eu.openaire.urls_controller.components.BulkImport;
+import eu.openaire.urls_controller.models.BulkImportReport;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * The operations for bulk-importing full-text files.
+ */
+public interface FullTextsService {
+
+
+    /**
+     * Performs the bulk-import of a directory. The controller queues the call to this method as a background task.
+     *
+     * @param bulkImportReport the report of this procedure, already holding the "request accepted" event
+     * @param relativeBulkImportDir the relative form of the directory; the controller passes it with a trailing "/"
+     * @param bulkImportDirName the full path of the directory, as a string ending with "/"
+     * @param bulkImportDir the same directory, as a File
+     * @param provenance the key of the configured bulk-import source
+     * @param bulkImportSource the configuration-rules of that source
+     * @param shouldDeleteFilesOnFinish presumably whether the successfully imported files get deleted afterwards; verify against the implementation
+     * @return NOTE(review): the meaning of the returned value is not visible here and the scheduler ignores it at the moment; confirm against the implementation
+     */
+    Boolean bulkImportFullTextsFromDirectory(BulkImportReport bulkImportReport, String relativeBulkImportDir, String bulkImportDirName, File bulkImportDir, String provenance, BulkImport.BulkImportSource bulkImportSource, boolean shouldDeleteFilesOnFinish);
+
+    // Judging by the name, it returns the locations of the files inside the given directory; TODO confirm the details (recursion, ordering, result on failure) against the implementation.
+    List<String> getFileLocationsInsideDir(String directory);
+
+    // Judging by the name, it returns the MD5-hash of the given string; TODO confirm the output-format and the result on failure against the implementation.
+    String getMD5hash(String string);
+
+}
--- a/src/main/java/eu/openaire/urls_controller/services/FullTextsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/FullTextsServiceImpl.java
@ -0,0 +1,443 @@
+package eu.openaire.urls_controller.services;
+
+
+import com.google.common.collect.Lists;
+import eu.openaire.urls_controller.components.BulkImport;
+import eu.openaire.urls_controller.configuration.ImpalaConnector;
+import eu.openaire.urls_controller.controllers.FullTextsController;
+import eu.openaire.urls_controller.models.BulkImportReport;
+import eu.openaire.urls_controller.models.DocFileData;
+import eu.openaire.urls_controller.models.FileLocationData;
+import eu.openaire.urls_controller.util.FileUtils;
+import eu.openaire.urls_controller.util.GenericUtils;
+import eu.openaire.urls_controller.util.ParquetFileUtils;
+import org.apache.avro.generic.GenericData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.dao.EmptyResultDataAccessException;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.stereotype.Service;
+
+import javax.xml.bind.DatatypeConverter;
+import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.security.MessageDigest;
+import java.sql.Types;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+
+@Service
+public class FullTextsServiceImpl implements FullTextsService {
+
+    private static final Logger logger = LoggerFactory.getLogger(FullTextsServiceImpl.class);
+
+
+    @Autowired
+    private FileUtils fileUtils;   // Used here for writing the report-file, uploading the files to S3, deleting files/directories and merging the parquet-files.
+
+
+    @Autowired
+    private ParquetFileUtils parquetFileUtils;   // Used here for creating the parquet-files, uploading them to HDFS and loading them into the database-table.
+
+    @Autowired
+    private JdbcTemplate jdbcTemplate;   // Used here for checking whether a file-hash already exists in the "payload" table.
+
+    public static final ExecutorService backgroundExecutor = Executors.newFixedThreadPool(2);   // At most 2 threads will be used. It is shut down by the "Application", on exit.
+
+    public static final List<Callable<Boolean>> backgroundCallableTasks = Collections.synchronizedList(new ArrayList<>());   // A synchronized list of tasks. It is not used inside this class; presumably it is handled by the Controller (TODO confirm).
+
+
+    private static final int numOfBulkImportThreads = 4;   // The size of the thread-pool which processes the segments of a bulk-import.
+    public static final ExecutorService bulkImportExecutor = Executors.newFixedThreadPool(numOfBulkImportThreads);   // At most 4 threads will be used. It runs the segment-tasks of the bulk-import procedures and is shut down by the "Application", on exit.
+
+
+    /**
+     * Given a directory with full-text-files, this method imports the full-texts files in the PDF Aggregation Service.
+     * Also, it provides the guarantee that the failed files will not be deleted! A file can "fail" if any of the expected results fail (upload-to-S3, parquet-creation and upload, load-to-db, ect)
+     * */
+    public Boolean bulkImportFullTextsFromDirectory(BulkImportReport bulkImportReport, String relativeBulkImportDir, String bulkImportDirName, File bulkImportDir, String provenance, BulkImport.BulkImportSource bulkImportSource, boolean shouldDeleteFilesOnFinish)
+    {
+        String bulkImportReportLocation = bulkImportReport.getReportLocation();
+
+        // Write to bulkImport-report file.
+        bulkImportReport.addEvent("Initializing the bulkImport '" + provenance + "' procedure with bulkImportDir '" + bulkImportDirName + "'.");
+        // Do not write immediately to the file, wait for the following checks.
+
+        if ( (ParquetFileUtils.payloadsSchema == null)   // Parse the schema if it's not already parsed.
+                && ((ParquetFileUtils.payloadsSchema = ParquetFileUtils.parseSchema(ParquetFileUtils.payloadSchemaFilePath)) == null ) ) {
+            String errorMsg = "The 'payloadsSchema' could not be parsed!";
+            logger.error(errorMsg);
+            bulkImportReport.addEvent(errorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+            return false;
+        }
+
+        List<String> fileLocations = getFileLocationsInsideDir(bulkImportDirName);
+        if ( fileLocations == null ) {
+            bulkImportReport.addEvent("Could not retrieve the files for bulk-import!");
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+            return false;
+        }
+
+        int numOfFiles = fileLocations.size();
+        if ( numOfFiles == 0 ) {
+            String errorMsg = "No files were found inside the bulkImportDir: " + bulkImportDirName;
+            logger.warn(errorMsg);
+            bulkImportReport.addEvent(errorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+            return false;
+        }
+
+        logger.trace("fileLocations:\n" + fileLocations);
+
+        String localParquetDir = parquetFileUtils.parquetBaseLocalDirectoryPath + "bulk_import_" + provenance + File.separator + relativeBulkImportDir; // This ends with "/".
+        try {
+            Files.createDirectories(Paths.get(localParquetDir)); // No-op if it already exists.
+        } catch (Exception e) {
+            String errorMsg = "Could not create the local parquet-directory: " + localParquetDir;
+            logger.error(errorMsg, e);
+            bulkImportReport.addEvent(errorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+            return false;
+        }
+
+        // Create a new directory on HDFS, with this bulkImportDir name. So, that there will not be any "load data" operation to fail because another thread has loaded that base-dir right before.
+        String currentBulkImportHdfsDir = parquetFileUtils.parquetHDFSDirectoryPathPayloadsBulkImport + relativeBulkImportDir;
+        if ( ! parquetFileUtils.applyHDFOperation(parquetFileUtils.webHDFSBaseUrl + currentBulkImportHdfsDir + parquetFileUtils.mkDirsAndParams) ) { // N0-op if it already exists. It is very quick.
+            String errorMsg = "Could not create the hdfs-directory: " + currentBulkImportHdfsDir;
+            logger.error(errorMsg);
+            bulkImportReport.addEvent(errorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+            return false;
+        }
+
+        long timeMillis = System.currentTimeMillis();   // Store it here, in order to have the same for all current records.
+
+        List<Callable<Integer>> callables = new ArrayList<>(numOfFiles);
+        List<List<String>> subLists = Lists.partition(fileLocations, numOfBulkImportThreads);   // Divide the initial list to "numOfBulkImportThreads" subLists. The last one may have marginally fewer files.
+        int subListsSize = subLists.size();
+
+        bulkImportReport.addEvent("Going to import the files in " + subListsSize + " segments, in parallel.");
+        fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+
+        for ( int i = 0; i < subListsSize; ++i ) {
+            int finalI = i;
+            callables.add(() -> {   // Handle inserts to the "attempt" table. Insert 20% of the "attempt" queries.
+                return processBulkImportedFilesSegment(bulkImportReport, finalI, subLists.get(finalI), bulkImportDirName, localParquetDir, currentBulkImportHdfsDir, provenance, bulkImportSource, timeMillis, shouldDeleteFilesOnFinish);
+            });
+        }
+
+        int numFailedSegments = 0;
+        int numFailedFiles = 0;
+        try {
+            List<Future<Integer>> futures = bulkImportExecutor.invokeAll(callables);    // This waits for all tasks to finish.
+            int sizeOfFutures = futures.size();
+            for ( int i = 0; i < sizeOfFutures; ++i ) {
+                try {
+                    numFailedFiles += futures.get(i).get();
+                    if ( numFailedFiles == subLists.get(i).size() ) {   // Get and see if it was successfully or not, or if an exception is thrown..
+                        numFailedSegments++;
+                    }
+                    // In case all the files failed to be bulk-imported, then we will detect it in the "numSuccessfulSegments"-check later.
+                    // The failed-to-be-imported files, will not be deleted, even if the user specifies that he wants to delete the directory.
+                } catch (ExecutionException ee) {
+                    String stackTraceMessage = GenericUtils.getSelectiveStackTrace(ee, null, 15);	// These can be serious errors like an "out of memory exception" (Java HEAP).
+                    logger.error("Task_" + (i+1) + " failed with: " + ee.getMessage() + "\n" + stackTraceMessage);
+                } catch (CancellationException ce) {
+                    logger.error("Task_" + (i+1) + " was cancelled: " + ce.getMessage());
+                } catch (IndexOutOfBoundsException ioobe) {
+                    logger.error("IOOBE for task_" + i + " in the futures-list! " + ioobe.getMessage());
+                }
+            }
+        } catch (Exception e) {
+            String errorMsg = "An error occurred when trying to bulk-import data from bulkImportDir: " + bulkImportDirName;
+            logger.error(errorMsg, e);
+            bulkImportReport.addEvent(errorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+            return false;
+        } finally {
+            logger.debug("Deleting local parquet directory: " + localParquetDir);
+            fileUtils.deleteDirectory(new File(localParquetDir));   // It may not exist at all, if none of the parquet files were created.
+        }
+
+        // Check the results.
+        String msg;
+        if ( numFailedFiles == numOfFiles ) {
+            String errorMsg = "None of the files inside the bulkImportDir '" + bulkImportDirName + "' were imported!";
+            logger.error(errorMsg);
+            bulkImportReport.addEvent(errorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+            return false;
+        } else if ( numFailedFiles > 0 ) {    // Some failed, but not all.
+            msg = numFailedFiles + " files" + (numFailedSegments > 0 ? (" and " + numFailedSegments + " whole segments") : "") + " failed to be bulk-imported, from the bulkImportDir: " + bulkImportDirName;
+            logger.warn(msg);
+        } else {
+            msg = "All " + numOfFiles + " files, from bulkImportDir '" + bulkImportDirName + "' were bulkImported successfully.";
+            logger.info(msg);
+        }
+        bulkImportReport.addEvent(msg);
+        fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+
+        // Merge the parquet files inside the table "payload_bulk_import", to improve performance of future operations.
+        ImpalaConnector.databaseLock.lock();
+        String mergeErrorMsg = fileUtils.mergeParquetFiles("payload_bulk_import", "", null);
+        if ( mergeErrorMsg != null ) {
+            ImpalaConnector.databaseLock.unlock();
+            bulkImportReport.addEvent(mergeErrorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+            return false;
+        }
+        ImpalaConnector.databaseLock.unlock();
+
+        String successMsg = "Finished the bulk-import procedure for '" + provenance + "' and bulkImportDir: " + bulkImportDirName;
+        logger.info(successMsg);
+        bulkImportReport.addEvent(successMsg);
+        fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+
+        FullTextsController.bulkImportDirs.remove(bulkImportDirName);
+        return true;
+    }
+
+
+    /**
+     * Processes one segment of the files of a bulk-import procedure. It is executed inside a thread of the "bulkImportExecutor".
+     * For each file, a payload-record is generated (see "processBulkImportedFile()"). Then, all the records of the segment are written in one parquet-file,
+     * which is uploaded to HDFS and loaded into the "payload_bulk_import" table.
+     * The files are deleted only if the whole segment succeeded, the user requested it and the specific file did not fail.
+     *
+     * @param bulkImportReport the report in which the events are recorded
+     * @param segmentCounter the number of this segment, used in the messages and in the name of the parquet-file
+     * @param fileLocationsSegment the locations of the files of this segment
+     * @param bulkImportDirName the bulk-import directory, used in the messages
+     * @param localParquetDir the local directory in which the parquet-file is created
+     * @param currentBulkImportHdfsDir the HDFS directory in which the parquet-file is uploaded
+     * @param provenance the provenance of this bulk-import procedure
+     * @param bulkImportSource the configuration of the datasource the files belong to
+     * @param timeMillis the timestamp which is set to all the records
+     * @param shouldDeleteFilesOnFinish whether the successfully imported files should be deleted
+     * @return the number of the files which failed to be imported; it equals to the size of the segment when a segment-wide step fails
+     * */
+    private int processBulkImportedFilesSegment(BulkImportReport bulkImportReport, int segmentCounter, List<String> fileLocationsSegment, String bulkImportDirName, String localParquetDir, String currentBulkImportHdfsDir,
+                                                    String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, boolean shouldDeleteFilesOnFinish)
+    {
+        // Inside this thread, process a segment of the files.
+        String bulkImportReportLocation = bulkImportReport.getReportLocation();
+
+        int numOfFilesInSegment = fileLocationsSegment.size();
+        String msg = "Going to import " + numOfFilesInSegment + " files for segment-" + segmentCounter + " , of bulkImport procedure '" + provenance + "' | dir: '" + bulkImportDirName + "'..";
+        logger.debug(msg);
+        bulkImportReport.addEvent(msg);
+
+        List<GenericData.Record> payloadRecords = new ArrayList<>(numOfFilesInSegment);
+
+        // Use a HashSet for the failed files, in order to not remove them in the end.
+        HashSet<String> failedFiles = new HashSet<>();
+        int counter = 0;
+
+        // Upload files to S3 and collect payloadRecords.
+        for ( String fileLocation: fileLocationsSegment ) {
+            GenericData.Record record = processBulkImportedFile(fileLocation, provenance, bulkImportSource, timeMillis);
+            if ( record != null )
+                payloadRecords.add(record);
+            else {
+                bulkImportReport.addEvent("An error caused the file: '" + fileLocation + "' to not be imported!");
+                failedFiles.add(fileLocation);
+            }
+
+            if ( ((++counter) % 100) == 0 ) {   // Every 100 files, report the status.
+                bulkImportReport.addEvent("Progress for segment-" + segmentCounter + " : " + payloadRecords.size() + " files have been imported and " + failedFiles.size() + " have failed, out of " + numOfFilesInSegment + " files.");
+                fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            }
+        }
+
+        int numOfPayloadRecords = payloadRecords.size();
+        if ( numOfPayloadRecords == 0 ) {
+            String errorMsg = "No payload-records were generated for any of the files of segment-" + segmentCounter + ", inside the bulkImportDir: " + bulkImportDirName;
+            logger.warn(errorMsg);
+            bulkImportReport.addEvent(errorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            // None of the files of this segment will be deleted, in any case.
+            return numOfFilesInSegment;
+        } else if ( numOfPayloadRecords != numOfFilesInSegment ) {
+            // Write this important note here, in order to certainly be in the report, even if a parquet-file failure happens and the method exits early.
+            String errorMsg = failedFiles.size() + " out of " + numOfFilesInSegment + " files failed to be imported, for segment-" + segmentCounter + "!";
+            logger.warn(errorMsg);
+            bulkImportReport.addEvent(errorMsg);
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+        }
+
+        // Construct the parquet file, upload it to HDFS and load it in the "payload_bulk_import" table.
+        String parquetFileName = "payloads_" + segmentCounter + ".parquet";
+        String fullLocalParquetFilePath = localParquetDir + parquetFileName;
+        logger.trace("Going to write " + numOfPayloadRecords + " payload-records to the parquet file: " + fullLocalParquetFilePath);    // DEBUG!
+
+        if ( ! parquetFileUtils.writeToParquet(payloadRecords, ParquetFileUtils.payloadsSchema, fullLocalParquetFilePath) ) {
+            bulkImportReport.addEvent("Could not write the payload-records to the parquet-file: '" + parquetFileName + "'!");
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            // None of the files of this segment will be deleted, in any case.
+            return numOfFilesInSegment;
+        }
+        //logger.trace("Parquet file '" + parquetFileName + "' was created and filled.");    // DEBUG!
+
+        // Upload the parquet-file to HDFS. (no database-locking is required for the upload)
+        String errorMsg = parquetFileUtils.uploadParquetFileToHDFS(fullLocalParquetFilePath, parquetFileName, currentBulkImportHdfsDir);
+        if ( errorMsg != null ) {   // The possible error-message returned, is already logged by the Controller.
+            bulkImportReport.addEvent("Could not upload the parquet-file '" + parquetFileName + "' to HDFS!");
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            // None of the files of this segment will be deleted, in any case.
+            return numOfFilesInSegment;
+        }
+
+        // Load the data of the uploaded parquet-file into the "payload_bulk_import" table, while holding the database-lock.
+        boolean wasDataLoaded;
+        ImpalaConnector.databaseLock.lock();
+        try {
+            wasDataLoaded = parquetFileUtils.loadParquetDataIntoTable((currentBulkImportHdfsDir + parquetFileName), "payload_bulk_import");
+        } finally {
+            ImpalaConnector.databaseLock.unlock();  // Always release the lock, even if an unexpected exception is thrown. Otherwise, the other segments and services would block forever.
+        }
+        if ( !wasDataLoaded ) {
+            bulkImportReport.addEvent("Could not load the payload-records to the database!");
+            fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport());
+            // None of the files of this segment will be deleted, in any case.
+            return numOfFilesInSegment;
+        }
+
+        String segmentSuccessMsg = "Finished importing " + numOfPayloadRecords + " files, out of " + numOfFilesInSegment + " , for segment-" + segmentCounter + ".";
+        logger.info(segmentSuccessMsg);
+        bulkImportReport.addEvent(segmentSuccessMsg);
+
+        if ( shouldDeleteFilesOnFinish ) {
+            segmentSuccessMsg = "As the user requested, the successfully imported files of '" + provenance + "' procedure, of bulk-import segment-" + segmentCounter + ", from directory '" + bulkImportDirName + "', will be deleted.";
+            logger.info(segmentSuccessMsg);
+            bulkImportReport.addEvent(segmentSuccessMsg);
+
+            // Delete all files except the ones in the "failedFiles" HashSet.
+            for ( String fileLocation : fileLocationsSegment ) {
+                if ( !failedFiles.contains(fileLocation) )
+                    if ( !fileUtils.deleteFile(fileLocation) )
+                        bulkImportReport.addEvent("The file '" + fileLocation + "' could not be deleted! Please make sure you have provided the WRITE-permission.");
+            }
+        }
+
+        return (numOfFilesInSegment - numOfPayloadRecords); // Return the numOfFailedFiles.
+    }
+
+
+    /**
+     * Generates the payload-record for a single bulk-imported file.
+     * The hash of the file is looked up in the "payload" table. If a location is already stored for that hash, it is reused.
+     * Otherwise, the file is uploaded to the S3 ObjectStore and the new location is used.
+     *
+     * @param fileLocation the location of the file to be imported
+     * @param provenance the provenance which is set to the record
+     * @param bulkImportSource the configuration of the datasource (datasourceID, datasourcePrefix, pdfUrlPrefix, mimeType)
+     * @param timeMillis the timestamp which is set as the "date" of the record
+     * @return the payload-record, or null if the file-hash or the id-hash could not be calculated, the file-location could not be analyzed, or the upload to S3 failed
+     * */
+    private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis)
+    {
+        File fullTextFile = new File(fileLocation);
+        DocFileData docFileData = new DocFileData(fullTextFile, null, null, null);
+        docFileData.calculateAndSetHashAndSize();
+
+        // Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
+        // Also, it may be the case that this file was downloaded by another datasource.
+
+        String fileHash = docFileData.getHash();
+        if ( fileHash == null )
+            return null;    // No check of past found full-text can be made nor the S3-fileName can be created.
+
+        FileLocationData fileLocationData;
+        try {
+            fileLocationData = new FileLocationData(fileLocation);
+        } catch (RuntimeException re) {
+            logger.error(re.getMessage());
+            return null;
+        }
+
+        String datasourceId = bulkImportSource.getDatasourceID();
+        String datasourcePrefix = bulkImportSource.getDatasourcePrefix();
+        String fileNameID = fileLocationData.getFileNameID();
+
+        String actualUrl = (bulkImportSource.getPdfUrlPrefix() + fileNameID); // This is the url with the arXiv-ID.
+        String originalUrl = actualUrl; // We have the full-text files from bulk-import, so let's assume the original-url is also the full-text-link.
+
+        // Calculate the id-hash before querying the database, in order to avoid acquiring the database-lock for a file which will be rejected anyway.
+        // Use the "Locale.ROOT", so that the lowercase-result (and thus the id) does not depend on the default locale of the machine (e.g. the Turkish "I").
+        String idMd5hash = getMD5hash(fileNameID.toLowerCase(java.util.Locale.ROOT));
+        if ( idMd5hash == null )
+            return null;
+
+        // openaire id = <datasourcePrefix> + "::" + <md5(lowercase(arxivId))>
+        String openAireId = (datasourcePrefix + "::" + idMd5hash);
+        //logger.trace("openAireId: " + openAireId);
+
+        final String getFileLocationForHashQuery = "select `location` from " + ImpalaConnector.databaseName + ".payload where `hash` = ? limit 1";
+        final int[] hashArgType = new int[] {Types.VARCHAR};
+        String alreadyFoundFileLocation = null;
+        ImpalaConnector.databaseLock.lock();
+        try {
+            alreadyFoundFileLocation = jdbcTemplate.queryForObject(getFileLocationForHashQuery, new Object[]{fileHash}, hashArgType, String.class);
+        } catch (EmptyResultDataAccessException erdae) {
+            // No fileLocation is found, it's ok. It will be null by default.
+        } catch (Exception e) {
+            logger.error("Error when executing or acquiring data from the the 'getFileLocationForHashQuery'!\n", e);
+            // Continue with bulk-importing the file and uploading it to S3.
+        } finally {
+            ImpalaConnector.databaseLock.unlock();
+        }
+
+        String s3Url = null;
+
+        if ( alreadyFoundFileLocation != null ) // If the full-text of this record is already-found and uploaded.
+        {
+            // This full-text was found to already be in the database.
+            // If it has the same datasourceID, then it likely was crawled before from an ID belonging to this datasource.
+                // If also has the same ID, then the exact same record from that datasource was retrieved previously.
+                // Else, the file was downloaded by another record of this datasource.
+            // Else if the datasourceID is not the same, then the same file was retrieved from another datasource.
+            // The above analysis is educational, it does not need to take place and is not currently used.
+
+            s3Url = alreadyFoundFileLocation;
+        } else {
+            try {
+                s3Url = fileUtils.constructFileNameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);  // This throws Exception, in case the uploading failed.
+                if ( s3Url == null )
+                    return null;    // In case the 'datasourceID' or 'hash' is null. Which should never happen here, since both of them are checked before the execution reaches here.
+            } catch (Exception e) {
+                logger.error("Could not upload the file '" + fileLocationData.getFileName() + "' to the S3 ObjectStore!", e);
+                return null;
+            }
+        }
+
+        GenericData.Record record = new GenericData.Record(ParquetFileUtils.payloadsSchema);
+        record.put("id", openAireId);
+        record.put("original_url", originalUrl);
+        record.put("actual_url", actualUrl);
+        record.put("date", timeMillis);
+        record.put("mimetype", bulkImportSource.getMimeType());
+        Long size = docFileData.getSize();
+        record.put("size", ((size != null) ? String.valueOf(size) : null)); // The size is stored as a string inside the record.
+        record.put("hash", fileHash);  // This is already checked and will not be null here.
+        record.put("location", s3Url);
+        record.put("provenance", provenance);
+
+        return record;
+    }
+
+
+    public List<String> getFileLocationsInsideDir(String directory)
+    {
+        List<String> fileLocations = null;
+
+        try ( Stream<Path> walkStream = Files.find(Paths.get(directory), Integer.MAX_VALUE, (filePath, fileAttr) -> fileAttr.isRegularFile()) )
+        // In case we ever include other type-of-Files inside the same directory, we need to add this filter: "&& !filePath.toString().endsWith("name.ext")"
+        {
+            fileLocations = walkStream.map(Path::toString).collect(Collectors.toList());
+        } catch (Exception e) {
+            String errorMsg = "Could not retrieve the files from directory: '" + directory + "'!";
+            logger.error(errorMsg, e);
+            return null;
+        }
+
+        return fileLocations;
+    }
+
+
+    /**
+     * Calculates the MD5-hash of the given string.
+     * The string is encoded in UTF-8 before it gets hashed, so the result does not depend on the default charset of the machine.
+     *
+     * @param string the text to be hashed
+     * @return the hash as a lowercase hex-string of 32 characters, or null if the hash could not be calculated (e.g. when the given string is null)
+     * */
+    @Override
+    public String getMD5hash(String string)
+    {
+        try {
+            MessageDigest md5MD = MessageDigest.getInstance("MD5"); // New instance for any new request. Otherwise, we need to synchronize the use of that object among multiple threads.
+            md5MD.update(string.getBytes(java.nio.charset.StandardCharsets.UTF_8));    // Explicit charset: the no-arg "getBytes()" uses the platform-default one, which could give different hashes for non-ASCII IDs.
+            return DatatypeConverter.printHexBinary(md5MD.digest()).toLowerCase(java.util.Locale.ROOT);
+        } catch (Exception e) {    // This includes the NPE, in case the given string is null.
+            logger.error("Error when getting the MD5-hash for: " + string, e);
+            return null;
+        }
+    }
+
+}
--- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java
@ -1,5 +1,6 @@
 package eu.openaire.urls_controller.services;

+import eu.openaire.urls_controller.components.BulkImport;
 import eu.openaire.urls_controller.configuration.ImpalaConnector;
 import eu.openaire.urls_controller.models.*;
 import eu.openaire.urls_controller.payloads.responces.AssignmentsResponse;
@ -23,6 +24,7 @@ import java.sql.PreparedStatement;
 import java.sql.SQLException;
 import java.sql.Timestamp;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
@ -55,19 +57,30 @@ public class UrlsServiceImpl implements UrlsService {
    public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);


-    public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord,
-                           @Value("${services.pdfaggregation.controller.datasources.excludedIDs}") List<String> excludedIDs) {
+    public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord, BulkImport bulkImport)
+    {
        maxAttemptsPerRecordAtomic = new AtomicInteger(maxAttemptsPerRecord);

-        // The "excludedIDs" will not be null, as it will be defined inside the "application.yml" file.
-        // In case no IDs for excluded Datasources are given, then the "excludedIDs" list will just be empty.
-        int exclusionListSize = excludedIDs.size();
-        if ( exclusionListSize == 0 )
+        HashMap<String, BulkImport.BulkImportSource> bulkImportSources = new HashMap<>(bulkImport.getBulkImportSources());
+        // The "bulkImportSources" will not be null, as it will be defined inside the "application.yml" file.
+        // In case no bulkImport Datasources are given, then the "bulkImportSources" list will just be empty.
+        if ( bulkImportSources.isEmpty() )
            return; // So the "excludedDatasourceIDsStringList" -code should be placed last in this Constructor-method.

+        logger.trace("BulkImportSources:\n" + bulkImportSources);
+
+        List<String> excludedIDs = new ArrayList<>();
+        for ( BulkImport.BulkImportSource source : bulkImportSources.values() ) {
+            String datasourceID = source.getDatasourceID();
+            if ( (datasourceID == null) || datasourceID.isEmpty() )
+                throw new RuntimeException("One of the bulk-imported datasourceIDs was not found! | source: " + source);
+            excludedIDs.add(datasourceID);
+        }
+
+        int exclusionListSize = excludedIDs.size(); // This list will not be empty.
+
        // Prepare the "excludedDatasourceIDsStringList" to be used inside the "findAssignmentsQuery". Create the following string-pattern:
        // ("ID_1", "ID_2", ...)
-
        final StringBuilder sb = new StringBuilder((exclusionListSize * 46) + (exclusionListSize -1) +2 );
        sb.append("(");
        for ( int i=0; i < exclusionListSize; ++i ) {
@ -78,7 +91,7 @@ public class UrlsServiceImpl implements UrlsService {
        sb.append(")");

        excludedDatasourceIDsStringList = sb.toString();
-        logger.info("The following datasources will be excluded from crawling: " + excludedDatasourceIDsStringList);
+        logger.info("The following bulkImport-datasources will be excluded from crawling: " + excludedDatasourceIDsStringList);
    }


@ -302,7 +315,7 @@ public class UrlsServiceImpl implements UrlsService {
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
        } finally {
            logger.debug("Deleting parquet directory: " + currentParquetPath);
-            FileUtils.deleteDirectory(new File(currentParquetPath));
+            fileUtils.deleteDirectory(new File(currentParquetPath));
        }

        logger.debug("Going to merge the parquet files for the tables which were altered.");
--- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
@ -6,6 +6,7 @@ import com.google.common.collect.SetMultimap;
 import eu.openaire.urls_controller.configuration.ImpalaConnector;
 import eu.openaire.urls_controller.models.Payload;
 import eu.openaire.urls_controller.models.UrlReport;
+import org.apache.commons.io.FileDeleteStrategy;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Autowired;
@ -31,6 +32,8 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

@ -109,7 +112,7 @@ public class FileUtils {

    // The following regex might be usefull in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension".
    // Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
-    private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:[^.()]+/)?((([^./()]+)[^./]*)(\\.[\\w]{2,10}))$");
+    public static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:([^.()]+)/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");

    private static final int numOfFullTextsPerBatch = 70;   // The HTTP-headers cannot be too large (It failed with 100 fileNames).

@ -183,7 +186,7 @@ public class FileUtils {
                    logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
                    return null;
                }
-                String fileNameWithExtension = matcher.group(1);
+                String fileNameWithExtension = matcher.group(2);
                if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
                    logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
                    return null;
@ -382,13 +385,13 @@ public class FileUtils {
                    logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION);
                    continue;
                }
-                // The "matcher.group(2)" returns the "filenameWithoutExtension", which is currently not used.
-                String fileNameID = matcher.group(3);
+                // The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
+                String fileNameID = matcher.group(4);
                if ( (fileNameID == null) || fileNameID.isEmpty() ) {
                    logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
                    continue;
                }
-                String dotFileExtension = matcher.group(4);
+                String dotFileExtension = matcher.group(5);
                if ( (dotFileExtension == null) || dotFileExtension.isEmpty() ) {
                    logger.error("Failed to extract the \"dotFileExtension\" from \"" + fileName + "\".");
                    continue;
@ -420,23 +423,10 @@ public class FileUtils {
                    continue;
                }

-                if ( datasourceId == null ) {
-                    logger.error("The retrieved \"datasourceId\" was \"null\" for file: " + fileName);
+                String s3Url = constructFileNameAndUploadToS3(targetDirectory, fileName, fileNameID, dotFileExtension, datasourceId, hash);
+                if ( s3Url == null )
                    continue;
-                }

-                if   ( hash == null ) {
-                    logger.error("The retrieved \"hash\" was \"null\" for file: " + fileName);
-                    continue;
-                }
-
-                String fileFullPath = targetDirectory + fileName;   // The fullPath to the local file.
-
-                // Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
-                // Now we append the file-hash, so it is guaranteed that the filename will be unique.
-                fileName = datasourceId + "/" + fileNameID + "::" + hash + dotFileExtension;    // This is the fileName to be used in the objectStore, not of the local file!
-
-                String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
                setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
                //numUploadedFiles ++;
            } catch (Exception e) {
@ -450,6 +440,28 @@ public class FileUtils {
    }


+    /**
+     * Builds the object-store name for a local full-text file and uploads that file through "s3ObjectStore.uploadToS3()".
+     * The object-store name has the form: datasourceId + "/" + openAireID + "::" + hash + dotFileExtension.
+     *
+     * @param fileDir the directory which contains the local file.
+     * @param fileName the name of the local file, inside the "fileDir". It is also used in the error-logs.
+     * @param openAireID the ID-part to be used in the object-store name.
+     * @param dotFileExtension the file-extension, including the leading dot.
+     * @param datasourceId the datasource-ID, used as the first part of the object-store name. If it is "null", nothing is uploaded.
+     * @param hash the file-hash, appended to the object-store name. If it is "null", nothing is uploaded.
+     * @return the value returned by "s3ObjectStore.uploadToS3()" (the caller treats it as the "s3Url"), or "null" if the "datasourceId" or the "hash" is "null".
+     * @throws Exception declared by this method; presumably propagated from the upload-call (verify against "uploadToS3").
+     */
+    public String constructFileNameAndUploadToS3(String fileDir, String fileName, String openAireID, String dotFileExtension, String datasourceId, String hash) throws Exception
+    {
+        if ( datasourceId == null ) {
+            logger.error("The retrieved \"datasourceId\" was \"null\" for file: " + fileName);
+            return null;
+        }
+
+        if ( hash == null ) {
+            logger.error("The retrieved \"hash\" was \"null\" for file: " + fileName);
+            return null;
+        }
+
+        // NOTE(review): a separator is always inserted here, so a "fileDir" which already ends with one gives a doubled separator in the path. Confirm that this is harmless for the callers.
+        String fileFullPath = fileDir + File.separator + fileName;   // The fullPath to the local file.
+
+        // Use the "openAireID" (the plain file-ID) and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
+        // Now we append the file-hash, so it is guaranteed that the filename will be unique.
+        fileName = datasourceId + "/" + openAireID + "::" + hash + dotFileExtension;    // This is the fileName to be used in the objectStore, not of the local file!
+
+        return s3ObjectStore.uploadToS3(fileName, fileFullPath);
+    }
+
+
    public String getMessageFromResponseBody(HttpURLConnection conn, boolean isError) {
        final StringBuilder msgStrB = new StringBuilder(500);
        try ( BufferedReader br = new BufferedReader(new InputStreamReader((isError ? conn.getErrorStream() : conn.getInputStream()))) ) {  // Try-with-resources
@ -561,11 +573,11 @@ public class FileUtils {
    }


-    public static boolean deleteDirectory(File directory)
+    public boolean deleteDirectory(File directory)
    {
        try {
            org.apache.commons.io.FileUtils.deleteDirectory(directory);
-            return true;
+            return true;    // Will return "true" also in case this directory does not exist. So, no Exception will be thrown for that case.
        } catch (IOException e) {
            logger.error("The following directory could not be deleted: " + directory.getName(), e);
            return false;
@ -575,4 +587,35 @@ public class FileUtils {
        }
    }

+
+    /**
+     * Deletes the file with the given full-path, using the "FileDeleteStrategy.FORCE" strategy.
+     *
+     * @param fileFullPathString the full-path of the file to be deleted.
+     * @return "true" if the delete-call finished without an IOException, otherwise "false" (the failure is logged along with its cause).
+     */
+    public boolean deleteFile(String fileFullPathString)
+    {
+        try {
+            FileDeleteStrategy.FORCE.delete(new File(fileFullPathString));
+        } catch (IOException e) {
+            // Pass the exception to the logger, so the reason of the failure (e.g. permissions, file in use) is not lost.
+            logger.error("Error when deleting the file: " + fileFullPathString, e);
+            return false;
+        }
+        return true;
+    }
+
+
+    // This lock serializes all the "writeToFile()" calls made through this FileUtils instance, no matter which file they target.
+    // The "true" argument requests the "fair" ordering policy of the ReentrantLock.
+    Lock fileWriteLock = new ReentrantLock(true);
+
+    /**
+     * Writes the given string to the file with the given full-path, while holding the "fileWriteLock".
+     *
+     * @param fileFullPath the full-path of the file to write to.
+     * @param stringToWrite the content to be written.
+     * @return "null" on success, otherwise an error-message (which is also logged, along with the exception).
+     */
+    public String writeToFile(String fileFullPath, String stringToWrite)
+    {
+        fileWriteLock.lock();
+        // "FileUtils.tenMb" is passed as the buffer-size of the outer BufferedWriter (presumably ten megabytes, verify against its declaration).
+        try ( BufferedWriter bufferedWriter = new BufferedWriter(Files.newBufferedWriter(Paths.get(fileFullPath)), FileUtils.tenMb) )
+        {
+            bufferedWriter.write(stringToWrite); // This will overwrite the file. If the new string is smaller, then it does not matter.
+        } catch (Exception e) {
+            // Any exception ends up here, including a failure during the write itself, even though the message mentions only the creation/acquisition of the file.
+            String errorMsg = "Failed to create or acquire the file \"" + fileFullPath + "\"!";
+            logger.error(errorMsg, e);
+            return errorMsg;
+        } finally {
+            fileWriteLock.unlock();    // Always release the lock, even when the write has failed.
+        }
+        return null;
+    }
+
 }
--- a/src/main/java/eu/openaire/urls_controller/util/GenericUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/GenericUtils.java
@ -1,10 +1,33 @@
 package eu.openaire.urls_controller.util;

+import java.text.SimpleDateFormat;
+import java.util.Date;
+
 public class GenericUtils {

+    // A "SimpleDateFormat" keeps mutable internal state and is NOT thread-safe. Since this instance is shared by all threads, every use of it must be synchronized.
+    private static final SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss.SSS z");
+
+    /**
+     * Returns the current date, time and time-zone, formatted with the pattern: "yyyy-MM-dd 'at' HH:mm:ss.SSS z".
+     * This method is safe to be called by multiple threads concurrently, as the access to the shared formatter is synchronized.
+     *
+     * @return the formatted current timestamp.
+     */
+    public static String getReadableCurrentTimeAndZone() {
+        synchronized ( simpleDateFormat ) {    // Guard the shared formatter, otherwise concurrent calls may produce corrupted output.
+            return simpleDateFormat.format(new Date());    // "new Date()" represents the current time.
+        }
+    }
+

+    /**
+     * Returns a pseudo-random integer, computed from "Math.random()" and intended to lie in the inclusive range [min, max].
+     * NOTE(review): the "(int)" cast truncates toward zero, so the inclusive-range result holds for non-negative bounds with min <= max. Confirm that no caller passes negative bounds.
+     *
+     * @param min the lower bound of the range.
+     * @param max the upper bound of the range.
+     * @return the generated number.
+     */
     public static int getRandomNumber(int min, int max) {
         return (int)(Math.random() * (max - min +1) + min);
     }

+
+    /**
+     * Builds a text which consists of the optional "initialMessage" and the first "numOfLines" elements of the stacktrace of the given Throwable.
+     * The stacktrace-elements are separated by new-lines, without a trailing new-line after the last one.
+     *
+     * @param thr the Throwable to take the stacktrace from. It must not be "null".
+     * @param initialMessage an optional message; if it is not "null", it is written first, followed by " Stacktrace:" and a new-line.
+     * @param numOfLines the maximum number of stacktrace-elements to include. Values below 1 result in no elements being included.
+     * @return the constructed text.
+     */
+    public static String getSelectiveStackTrace(Throwable thr, String initialMessage, int numOfLines)
+    {
+        StackTraceElement[] stels = thr.getStackTrace();
+        // Write at most "numOfLines" elements (not "numOfLines + 1") and never more than the available ones. Negative values are treated as zero.
+        int linesToWrite = Math.max(0, Math.min(numOfLines, stels.length));
+        StringBuilder sb = new StringBuilder((linesToWrite + 1) * 100);	// This StringBuilder is thread-safe as a local-variable.
+        if ( initialMessage != null )
+            sb.append(initialMessage).append(" Stacktrace:").append("\n");
+        for ( int i = 0; i < linesToWrite; ++i ) {
+            if ( i > 0 ) sb.append("\n");   // Separate the elements, without leaving a trailing new-line after the last one.
+            sb.append(stels[i]);
+        }
+        return sb.toString();
+    }
+
 }
--- a/src/main/java/eu/openaire/urls_controller/util/ParquetFileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/ParquetFileUtils.java
@ -59,18 +59,18 @@ public class ParquetFileUtils {
    private JdbcTemplate jdbcTemplate;

    @Value("${hdfs.baseUrl}")
-    private String webHDFSBaseUrl;
+    public String webHDFSBaseUrl;

    private final String hdfsHttpAuthString;

    private final String hdfsUserName;


-    private final String payloadSchemaFilePath = "schemas/payload.avsc";
+    public static final String payloadSchemaFilePath = "schemas/payload.avsc";

-    private final String attemptSchemaFilePath = "schemas/attempt.avsc";
+    private static final String attemptSchemaFilePath = "schemas/attempt.avsc";

-    public Schema payloadsSchema;
+    public static Schema payloadsSchema = null;
    public Schema attemptsSchema;

    public final String parquetHDFSDirectoryPathAttempts;
@ -79,12 +79,17 @@ public class ParquetFileUtils {
    public final String parquetHDFSDirectoryPathPayloadsAggregated;
    public final String parquetHDFSDirectoryPathPayloadsBulkImport;

+    public String mkDirsAndParams;
+
+    //public String setPermAndParams;
+

    public ParquetFileUtils(@Value("${hdfs.baseUrl}") String webHDFSBaseUrl,
                            @Value("${hdfs.httpAuth}") String hdfsHttpAuthString, @Value("${hdfs.userName}") String hdfsUserName, @Value("${hdfs.password}") String hdfsPassword, @Value("${services.pdfaggregation.controller.parquetLocalDirectoryPath}") String parquetBaseDirectoryPath,
                            @Value("${hdfs.parquetRemoteBaseDirectoryPath}") String hdfsParquetBaseDir,
                            @Value("${services.pdfaggregation.controller.isTestEnvironment}") boolean isTestEnvironment, FileUtils fileUtils) throws IOException
    {
+        this.mkDirsAndParams = "?op=MKDIRS&permission=777&user.name=" + hdfsUserName;
        if ( webHDFSBaseUrl.endsWith("/") ) // We don't wand an ending slash in the url (as it causes problems when the file=path is added).
            this.webHDFSBaseUrl = webHDFSBaseUrl.substring(0, (webHDFSBaseUrl.length() -1));
        else
@ -128,11 +133,13 @@ public class ParquetFileUtils {
        this.parquetHDFSDirectoryPathPayloadsAggregated = hdfsParquetBaseDir + "payloads_aggregated/";
        this.parquetHDFSDirectoryPathPayloadsBulkImport = hdfsParquetBaseDir + "payloads_bulk_import/";
        this.fileUtils = fileUtils;
+        this.mkDirsAndParams = "?op=MKDIRS&permission=777&user.name=" + hdfsUserName;   // All permissions for user, group and others must be set, in order for this service' user to have access to the hdfs directory.
+        //this.setPermAndParams = "?op=SETPERMISSION&permission=777&user.name=" + hdfsUserName;
        createRemoteParquetDirectories(hdfsParquetBaseDir);
    }


-    public Schema parseSchema(String schemaResourcePath) {
+    public static Schema parseSchema(String schemaResourcePath) {
        try {
            return (new Schema.Parser()).parse(new ClassPathResource(schemaResourcePath).getInputStream());
        } catch (Throwable e) {
@ -464,8 +471,6 @@ public class ParquetFileUtils {
        // The WebHDFS uses the "mkdirs" operations which creates all the non-existent directories in the specified path.
        // So with one request we will create the "parquet_uploads/" and the "parquet_uploads/attempts/" and with the seconds request, the "parquet_uploads/payloads/" directory.

-        String mkDirsParams = "?op=MKDIRS&permission=777&user.name=" + hdfsUserName;
-
        logger.info("Going to check if the remote parquet directories exist.");

        String listMainDirectoryUrl = webHDFSBaseUrl + parquetBaseRemoteDirectory + "?op=LISTSTATUS&user.name=" + hdfsUserName;
@ -492,9 +497,9 @@ public class ParquetFileUtils {

            if ( statusCode == 404 ) {
                logger.info("The directory \"" + parquetBaseRemoteDirectory + "\" does not exist. We will create it, along with its sub-directories.");
-                attemptCreationSuccessful = createHDFSDirectory(webHDFSBaseUrl + parquetHDFSDirectoryPathAttempts + mkDirsParams);
-                payloadAggregatedCreationSuccessful = createHDFSDirectory(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsAggregated + mkDirsParams);
-                payloadBulkImportCreationSuccessful = createHDFSDirectory(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsBulkImport + mkDirsParams);
+                attemptCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathAttempts + mkDirsAndParams);
+                payloadAggregatedCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsAggregated + mkDirsAndParams);
+                payloadBulkImportCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsBulkImport + mkDirsAndParams);
            }
            else {
                // Check the json-response, to see if all the subdirectories exist.
@ -549,19 +554,19 @@ public class ParquetFileUtils {
                // For each missing subdirectories, run the mkDirs-request.
                if ( !foundAttemptsDir ) {
                    logger.debug("The remote parquet directory \"" + parquetHDFSDirectoryPathAttempts + "\" does not exist! Going to create it.");
-                    attemptCreationSuccessful = createHDFSDirectory(webHDFSBaseUrl + parquetHDFSDirectoryPathAttempts + mkDirsParams);
+                    attemptCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathAttempts + mkDirsAndParams);
                } else
                    logger.info("The remote parquet directory \"" + parquetHDFSDirectoryPathAttempts + "\" exists.");

                if ( !foundPayloadsAggregatedDir ) {
                    logger.debug("The remote parquet directory \"" + parquetHDFSDirectoryPathPayloadsAggregated + "\" does not exist! Going to create it.");
-                    payloadAggregatedCreationSuccessful = createHDFSDirectory(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsAggregated + mkDirsParams);
+                    payloadAggregatedCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsAggregated + mkDirsAndParams);
                } else
                    logger.info("The remote parquet directory \"" + parquetHDFSDirectoryPathPayloadsAggregated + "\" exists.");

                if ( !foundPayloadsBulkImportDir ) {
                    logger.debug("The remote parquet directory \"" + parquetHDFSDirectoryPathPayloadsBulkImport + "\" does not exist! Going to create it.");
-                    payloadBulkImportCreationSuccessful = createHDFSDirectory(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsBulkImport + mkDirsParams);
+                    payloadBulkImportCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsBulkImport + mkDirsAndParams);
                } else
                    logger.info("The remote parquet directory \"" + parquetHDFSDirectoryPathPayloadsBulkImport + "\" exists.");
            }
@ -575,10 +580,10 @@ public class ParquetFileUtils {
    }


-    public boolean createHDFSDirectory(String createDirectoryUrl)
+    public boolean applyHDFOperation(String hdfsOperationUrl)
    {
        try {
-            URL url = new URL(createDirectoryUrl);
+            URL url = new URL(hdfsOperationUrl);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("PUT");
            conn.setRequestProperty("Authorization", hdfsHttpAuthString);
@ -586,15 +591,15 @@ public class ParquetFileUtils {
            conn.connect();
            int statusCode = conn.getResponseCode();
            if ( statusCode == -1 ) {
-                logger.error("Problem when getting the \"status-code\" for url: " + createDirectoryUrl);
+                logger.error("Problem when getting the \"status-code\" for url: " + hdfsOperationUrl);
                return false;
            }
            else if ( statusCode != 200 ) {
-                String errorMsg = "We expected a \"200 OK\" response, but got: \"" + statusCode + "\" instead, for url: " + createDirectoryUrl;
+                String errorMsg = "We expected a \"200 OK\" response, but got: \"" + statusCode + "\" instead, for url: " + hdfsOperationUrl;
                logger.error(errorMsg + "\n\n" + fileUtils.getMessageFromResponseBody(conn, true));
                return false;
            }
-            logger.trace("Creation was successful for hdfs-dir-url: " + createDirectoryUrl + "\n" + fileUtils.getMessageFromResponseBody(conn, false));
+            logger.trace("The Operation was successful for hdfs-op-url: " + hdfsOperationUrl + "\n" + fileUtils.getMessageFromResponseBody(conn, false));
        } catch (Exception e) {
            logger.error("", e);
            return false;
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@ -29,12 +29,22 @@ services:
                shouldEmptyBucket: false
                shouldShowAllS3Buckets: true

-            datasources:    # Provide a list of datasource IDs, which should be excluded from crawling. Their content is either bulk-imported or is known to be restricted.
-                excludedIDs: >  # Use comma-seperated values (one in each line for best readability), as Spring has is currently incapable of parsing Dropwizard-styled lists (at least without additional config).
-                    opendoar____::6f4922f45568161a8cdf4ad2299f6d23

-                # Since we use a multi-line value from our list, we add the ID-explanations here (otherwise comments will be part of values):
-                # First-id: arXiv.org e-Print Archive
+bulkImport:
+    baseBulkImportLocation: /mnt/bulkImport/
+    bulkImportReportLocation: /bulkImportReports/
+    bulkImportSources:  # These sources are accepted for bulk-import requests and are excluded from crawling.
+        arxivImport:
+            datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
+            datasourcePrefix: arXiv_______  # For PID-providing datasource, we use the PID-prefix here. (so not the datasource-prefix: "od________18")
+            pdfUrlPrefix: https://arxiv.org/pdf/
+            mimeType: application/pdf
+#        otherImport:
+#            datasourceID: othersource__::0123
+#            datasourcePrefix: other_______
+#            pdfUrlPrefix: https://example.org/pdf/
+#            mimeType: application/pdf
+

 spring:
    application: