package eu.openaire.urls_controller.util;

import com.google.common.collect.HashMultimap;
import eu.openaire.urls_controller.models.Task;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.configurationprocessor.json.JSONException;
import org.springframework.boot.configurationprocessor.json.JSONObject;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Scanner;
import java.util.stream.Stream;

/**
 * Loads (id, url) pairs from a bundled JSON test-resource file and serves them in batches.
 * <p>
 * NOTE(review): state is kept in static fields ({@code inputScanner}, {@code fileIndex}, counters),
 * so this class is effectively a singleton and is NOT thread-safe — confirm callers never use
 * more than one instance / thread concurrently.
 */
public class FileUtils {

    private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);

    public static Scanner inputScanner = null;

    // Number of input-file lines consumed so far (acts as a cursor across batches).
    private static int fileIndex = 0;

    // Lines which were empty or could not be json-decoded.
    private static int unretrievableInputLines = 0;

    public static int duplicateIdUrlEntries = 0;

    // Max number of url-lines loaded per call of "getNextIdUrlPairBatchFromJson()".
    public static int jsonBatchSize = 3000;

    private static final String utf8Charset = "UTF-8";

    public static String inputFileFullPath;

    private static String workingDir = System.getProperty("user.dir") + File.separator;


    /**
     * Opens the bundled input-resource file and prepares the static scanner for batch-reading.
     * @throws RuntimeException if the resource file cannot be found on the classpath.
     */
    public FileUtils() throws RuntimeException
    {
        // Classloader resource-names always use '/' (regardless of OS), so do NOT build this with "File.separator".
        String resourceFileName = "testInputFiles/orderedList1000.json";

        // The filesystem path, on the other hand, uses the platform separator.
        inputFileFullPath = workingDir + "src" + File.separator + "main" + File.separator + "resources"
                + File.separator + resourceFileName.replace("/", File.separator);

        InputStream inputStream = getClass().getClassLoader().getResourceAsStream(resourceFileName);
        if ( inputStream == null )
            throw new RuntimeException("No resourceFile was found with name \"" + resourceFileName + "\".");

        logger.debug("Going to retrieve the data from the inputResourceFile: {}", resourceFileName);

        FileUtils.inputScanner = new Scanner(inputStream, utf8Charset);
        fileIndex = 0;	// Re-initialize the file-number-pointer.
    }


    // This is currently not used, but it may be useful in a future scenario.
    /**
     * Counts the lines of the input file on disk.
     * @return the number of lines, or -1 on an IO-error.
     */
    private static long getInputFileLinesNum()
    {
        // "Files.lines()" returns a Stream backed by an open file-handle; it MUST be closed, hence the try-with-resources.
        try ( Stream<String> linesStream = Files.lines(Paths.get(inputFileFullPath)) ) {
            long numOfLines = linesStream.count();
            logger.debug("The numOfLines in the inputFile is {}", numOfLines);
            return numOfLines;
        } catch (IOException e) {
            // Pass the exception itself, so the stack-trace is not lost.
            logger.error("Could not retrieve the numOfLines.", e);
            return -1;
        }
    }


    /**
     * This method decodes a JSON String into its members ("id" and "url").
     * @param jsonLine String
     * @return a {@link Task} holding the id and url, or {@code null} if the line could not be parsed or the url is missing.
     */
    public static Task jsonDecoder(String jsonLine)
    {
        String idStr;
        String urlStr;
        try {
            JSONObject jObj = new JSONObject(jsonLine);	// Construct a JSONObject from the retrieved jsonLine.
            idStr = jObj.get("id").toString();
            urlStr = jObj.get("url").toString();
        } catch (JSONException je) {
            logger.warn("JSONException caught when tried to parse and extract values from jsonLine: \t" + jsonLine, je);
            return null;
        }

        if ( urlStr.isEmpty() ) {
            if ( !idStr.isEmpty() )	// If we only have the id, then go and log it.
                logger.warn("The url was not found for id: \"" + idStr + "\"");
            return null;
        }

        return new Task(idStr, urlStr, null);
    }


    /**
     * This method parses the next batch of lines from the JSON input and extracts the urls, along with the IDs.
     * Empty and non-decodable lines are counted as unretrievable and skipped; duplicate (id, url) pairs are counted separately.
     * @return a HashMultimap of id -&gt; urls for the current batch (possibly empty).
     */
    public static HashMultimap<String, String> getNextIdUrlPairBatchFromJson()
    {
        Task inputIdUrlTuple;
        int expectedPathsPerID = 5;
        int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID;

        HashMultimap<String, String> idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID);

        int curBeginning = fileIndex;

        while ( inputScanner.hasNextLine() && (fileIndex < (curBeginning + jsonBatchSize)) )
        {	// While (!EOF) and inside the current url-batch, iterate through lines.

            String retrievedLineStr = inputScanner.nextLine();
            fileIndex ++;

            if ( retrievedLineStr.isEmpty() ) {
                unretrievableInputLines ++;
                continue;
            }

            if ( (inputIdUrlTuple = jsonDecoder(retrievedLineStr)) == null ) {	// Decode the jsonLine and take the two attributes.
                logger.warn("A problematic inputLine found: \t" + retrievedLineStr);
                unretrievableInputLines ++;
                continue;
            }

            if ( !idAndUrlMappedInput.put(inputIdUrlTuple.getId(), inputIdUrlTuple.getUrl()) ) {
                // We have a duplicate url in the input.. log it here as we cannot pass it through the HashMultimap.
                // It's possible that this as well as the original might be/give a docUrl.
                duplicateIdUrlEntries ++;
            }
        }

        return idAndUrlMappedInput;
    }


    /**
     * This method returns the number of (non-heading, non-empty) lines we have read from the inputFile.
     * @return loadedUrls
     */
    public static int getCurrentlyLoadedUrls()	// In the end, it gives the total number of urls we have processed.
    {
        return FileUtils.fileIndex - FileUtils.unretrievableInputLines;
    }


    /**
     * This method checks if there is no more input-data and returns true in that case.
     * Otherwise, it returns false, if there is more input-data to be loaded.
     * @param isFirstRun whether this is the very first attempt to load data (controls error- vs debug-logging).
     * @param isEmptyOfData whether the last load-attempt produced no data.
     * @return finished loading / not finished
     */
    public static boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun)
    {
        if ( isEmptyOfData ) {
            if ( isFirstRun )
                logger.error("Could not retrieve any urls from the inputFile!");
            else
                logger.debug("Done loading " + FileUtils.getCurrentlyLoadedUrls() + " urls from the inputFile.");
            return true;
        }
        return false;
    }

}