You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
UrlsController/src/main/java/eu/openaire/urls_controller/util/FileUtils.java

170 lines
6.2 KiB
Java

package eu.openaire.urls_controller.util;
import com.google.common.collect.HashMultimap;
import eu.openaire.urls_controller.models.Task;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.configurationprocessor.json.JSONException;
import org.springframework.boot.configurationprocessor.json.JSONObject;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Scanner;
/**
 * Utility class which loads (id, url) pairs from a JSON input-resource-file in batches.
 * <p>
 * NOTE(review): state is kept in mutable static fields ({@code inputScanner}, {@code fileIndex},
 * counters), so this class is not thread-safe and effectively single-instance — confirm callers
 * never construct it concurrently.
 */
public class FileUtils {

    private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);

    // Shared scanner over the input resource; (re-)initialized by the constructor.
    public static Scanner inputScanner = null;

    // Number of input lines consumed so far; acts as the file-position pointer.
    private static int fileIndex = 0;

    // Count of lines which were empty or could not be parsed into an (id, url) pair.
    private static int unretrievableInputLines = 0;

    // Count of (id, url) pairs which were exact duplicates of already-loaded pairs.
    public static int duplicateIdUrlEntries = 0;

    // Maximum number of json-lines to load per call to "getNextIdUrlPairBatchFromJson()".
    public static int jsonBatchSize = 3000;

    private static final String utf8Charset = "UTF-8";

    // Filesystem path of the input file (used only by "getInputFileLinesNum()"); the data
    // itself is read from the classpath-resource stream.
    public static String inputFileFullPath;

    private static String workingDir = System.getProperty("user.dir") + File.separator;


    /**
     * Opens the input-resource-file and initializes the shared {@link Scanner} over it.
     * @throws RuntimeException if the resource-file cannot be found on the classpath
     */
    public FileUtils() throws RuntimeException
    {
        inputFileFullPath = workingDir + "src" + File.separator + "main" + File.separator + "resources";
        String resourceFileName = "testInputFiles" + File.separator + "orderedList1000.json";
        inputFileFullPath += File.separator + resourceFileName;

        InputStream inputStream = getClass().getClassLoader().getResourceAsStream(resourceFileName);
        if ( inputStream == null )
            throw new RuntimeException("No resourceFile was found with name \"" + resourceFileName + "\".");

        logger.debug("Going to retrieve the data from the inputResourceFile: {}", resourceFileName);

        FileUtils.inputScanner = new Scanner(inputStream, utf8Charset);
        fileIndex = 0; // Re-initialize the file-number-pointer.
    }


    /**
     * Counts the lines of the input file located at {@code inputFileFullPath}.
     * This is currently not used, but it may be useful in a future scenario.
     * @return the number of lines, or -1 in case of an I/O error
     */
    private static long getInputFileLinesNum()
    {
        long numOfLines;
        // "Files.lines()" must be closed, otherwise the underlying file-handle leaks;
        // the try-with-resources guarantees it is released.
        try ( Stream<String> lines = Files.lines(Paths.get(inputFileFullPath)) ) {
            numOfLines = lines.count();
            logger.debug("The numOfLines in the inputFile is {}", numOfLines);
        } catch (IOException e) {
            logger.error("Could not retrieve the numOfLines.", e);
            return -1;
        }
        return numOfLines;
    }


    /**
     * This method decodes a JSON String into its members ("id" and "url").
     * @param jsonLine String
     * @return a {@link Task} holding the id and url, or {@code null} if the line could not be
     *         parsed or the url was missing
     */
    public static Task jsonDecoder(String jsonLine)
    {
        // Get ID and url and put them in the Task-object.
        String idStr = null;
        String urlStr = null;
        try {
            JSONObject jObj = new JSONObject(jsonLine); // Construct a JSONObject from the retrieved jsonLine.
            idStr = jObj.get("id").toString();
            urlStr = jObj.get("url").toString();
        } catch (JSONException je) {
            logger.warn("JSONException caught when tried to parse and extract values from jsonLine: \t" + jsonLine, je);
            return null;
        }

        if ( urlStr.isEmpty() ) {
            if ( !idStr.isEmpty() ) // If we only have the id, then go and log it.
                logger.warn("The url was not found for id: \"{}\"", idStr);
            return null;
        }
        return new Task(idStr, urlStr, null);
    }


    /**
     * This method parses the next batch of json-lines (up to {@code jsonBatchSize}) and extracts
     * the urls, along with the IDs. It advances the shared {@code fileIndex} pointer and updates
     * the {@code unretrievableInputLines} / {@code duplicateIdUrlEntries} counters.
     * @return a HashMultimap mapping each id to its url(s)
     */
    public static HashMultimap<String, String> getNextIdUrlPairBatchFromJson()
    {
        Task inputIdUrlTuple;
        int expectedPathsPerID = 5;
        int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID;

        HashMultimap<String, String> idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID);

        int curBeginning = fileIndex;

        while ( inputScanner.hasNextLine() && (fileIndex < (curBeginning + jsonBatchSize)) )
        {// While (!EOF) and inside the current url-batch, iterate through lines.

            //logger.debug("fileIndex: " + FileUtils.fileIndex); // DEBUG!

            // Take each line, remove potential double quotes.
            String retrievedLineStr = inputScanner.nextLine();
            //logger.debug("Loaded from inputFile: " + retrievedLineStr); // DEBUG!

            fileIndex ++;

            if ( retrievedLineStr.isEmpty() ) {
                unretrievableInputLines ++;
                continue;
            }

            if ( (inputIdUrlTuple = jsonDecoder(retrievedLineStr)) == null ) { // Decode the jsonLine and take the two attributes.
                logger.warn("A problematic inputLine found: \t{}", retrievedLineStr);
                unretrievableInputLines ++;
                continue;
            }

            if ( !idAndUrlMappedInput.put(inputIdUrlTuple.getId(), inputIdUrlTuple.getUrl()) ) { // We have a duplicate url in the input.. log it here as we cannot pass it through the HashMultimap. It's possible that this as well as the original might be/give a docUrl.
                duplicateIdUrlEntries ++;
            }
        }

        return idAndUrlMappedInput;
    }


    /**
     * This method returns the number of (non-heading, non-empty) lines we have read from the inputFile.
     * @return loadedUrls
     */
    public static int getCurrentlyLoadedUrls() // In the end, it gives the total number of urls we have processed.
    {
        return FileUtils.fileIndex - FileUtils.unretrievableInputLines;
    }


    /**
     * This method checks if there is no more input-data and returns true in that case.
     * Otherwise, it returns false, if there is more input-data to be loaded.
     * An error is logged (on the first run) if no input-urls were retrieved in general.
     * @param isEmptyOfData whether the last-loaded batch came back empty
     * @param isFirstRun whether this is the first batch-load attempt
     * @return finished loading / not finished
     */
    public static boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun)
    {
        if ( isEmptyOfData ) {
            if ( isFirstRun )
                logger.error("Could not retrieve any urls from the inputFile!");
            else
                logger.debug("Done loading {} urls from the inputFile.", FileUtils.getCurrentlyLoadedUrls());
            return true;
        }
        return false;
    }

}