forked from lsmyrnaios/UrlsController
171 lines
6.2 KiB
Java
171 lines
6.2 KiB
Java
|
package eu.openaire.urls_controller.util;
|
||
|
|
||
|
import com.google.common.collect.HashMultimap;
|
||
|
import eu.openaire.urls_controller.models.Task;
|
||
|
import org.slf4j.Logger;
|
||
|
import org.slf4j.LoggerFactory;
|
||
|
import org.springframework.boot.configurationprocessor.json.JSONException;
|
||
|
import org.springframework.boot.configurationprocessor.json.JSONObject;
|
||
|
|
||
|
import java.io.File;
|
||
|
import java.io.IOException;
|
||
|
import java.io.InputStream;
|
||
|
import java.nio.file.Files;
|
||
|
import java.nio.file.Paths;
|
||
|
import java.util.Scanner;
|
||
|
|
||
|
public class FileUtils {
|
||
|
|
||
|
private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);
|
||
|
|
||
|
private static Scanner inputScanner = null;
|
||
|
private static int fileIndex = 0;
|
||
|
private static int unretrievableInputLines = 0;
|
||
|
private static int duplicateIdUrlEntries = 0;
|
||
|
public static int jsonBatchSize = 3000;
|
||
|
private static final String utf8Charset = "UTF-8";
|
||
|
public static String inputFileFullPath;
|
||
|
private static String userDir = System.getProperty("user.dir") + File.separator;
|
||
|
|
||
|
public FileUtils()
|
||
|
{
|
||
|
inputFileFullPath = userDir + "src" + File.separator + "main" + File.separator + "resources";
|
||
|
String resourceFileName = "testInputFiles" + File.separator + "orderedList1000.json";
|
||
|
inputFileFullPath += File.separator + resourceFileName;
|
||
|
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(resourceFileName);
|
||
|
if ( inputStream == null ) {
|
||
|
logger.error("No resourceFile was found with name \"" + resourceFileName + "\".");
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
logger.debug("Going to retrieve the data from the inputResourceFile: " + resourceFileName);
|
||
|
|
||
|
FileUtils.inputScanner = new Scanner(inputStream, utf8Charset);
|
||
|
fileIndex = 0; // Re-initialize the file-number-pointer.
|
||
|
}
|
||
|
|
||
|
|
||
|
// This is currently not used, but it may be useful in a future scenario.
|
||
|
private static long getInputFileLinesNum()
|
||
|
{
|
||
|
long numOfLines = 0;
|
||
|
try {
|
||
|
numOfLines = Files.lines(Paths.get(inputFileFullPath)).count();
|
||
|
logger.debug("The numOfLines in the inputFile is " + numOfLines);
|
||
|
} catch (IOException e) {
|
||
|
logger.error("Could not retrieve the numOfLines. " + e);
|
||
|
return -1;
|
||
|
}
|
||
|
return numOfLines;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* This method decodes a Jason String into its members.
|
||
|
* @param jsonLine String
|
||
|
* @return HashMap<String,String>
|
||
|
*/
|
||
|
public static Task jsonDecoder(String jsonLine)
|
||
|
{
|
||
|
// Get ID and url and put them in the HashMap
|
||
|
String idStr = null;
|
||
|
String urlStr = null;
|
||
|
try {
|
||
|
JSONObject jObj = new JSONObject(jsonLine); // Construct a JSONObject from the retrieved jsonLine.
|
||
|
idStr = jObj.get("id").toString();
|
||
|
urlStr = jObj.get("url").toString();
|
||
|
} catch (JSONException je) {
|
||
|
logger.warn("JSONException caught when tried to parse and extract values from jsonLine: \t" + jsonLine, je);
|
||
|
return null;
|
||
|
}
|
||
|
|
||
|
if ( urlStr.isEmpty() ) {
|
||
|
if ( !idStr.isEmpty() ) // If we only have the id, then go and log it.
|
||
|
logger.warn("The url was not found for id: \"" + idStr + "\"");
|
||
|
return null;
|
||
|
}
|
||
|
|
||
|
return new Task(idStr, urlStr);
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* This method parses a Json file and extracts the urls, along with the IDs.
|
||
|
* @return HashMultimap<String, String>
|
||
|
*/
|
||
|
public static HashMultimap<String, String> getNextIdUrlPairBatchFromJson()
|
||
|
{
|
||
|
Task inputIdUrlTuple;
|
||
|
int expectedPathsPerID = 5;
|
||
|
int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID;
|
||
|
|
||
|
HashMultimap<String, String> idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID);
|
||
|
|
||
|
int curBeginning = fileIndex;
|
||
|
|
||
|
while ( inputScanner.hasNextLine() && (fileIndex < (curBeginning + jsonBatchSize)) )
|
||
|
{// While (!EOF) and inside the current url-batch, iterate through lines.
|
||
|
|
||
|
//logger.debug("fileIndex: " + FileUtils.fileIndex); // DEBUG!
|
||
|
|
||
|
// Take each line, remove potential double quotes.
|
||
|
String retrievedLineStr = inputScanner.nextLine();
|
||
|
//logger.debug("Loaded from inputFile: " + retrievedLineStr); // DEBUG!
|
||
|
|
||
|
fileIndex ++;
|
||
|
|
||
|
if ( retrievedLineStr.isEmpty() ) {
|
||
|
unretrievableInputLines ++;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if ( (inputIdUrlTuple = jsonDecoder(retrievedLineStr)) == null ) { // Decode the jsonLine and take the two attributes.
|
||
|
logger.warn("A problematic inputLine found: \t" + retrievedLineStr);
|
||
|
unretrievableInputLines ++;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if ( !idAndUrlMappedInput.put(inputIdUrlTuple.getId(), inputIdUrlTuple.getUrl()) ) { // We have a duplicate url in the input.. log it here as we cannot pass it through the HashMultimap. It's possible that this as well as the original might be/give a docUrl.
|
||
|
duplicateIdUrlEntries ++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return idAndUrlMappedInput;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* This method returns the number of (non-heading, non-empty) lines we have read from the inputFile.
|
||
|
* @return loadedUrls
|
||
|
*/
|
||
|
public static int getCurrentlyLoadedUrls() // In the end, it gives the total number of urls we have processed.
|
||
|
{
|
||
|
return FileUtils.fileIndex - FileUtils.unretrievableInputLines;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* This method checks if there is no more input-data and returns true in that case.
|
||
|
* Otherwise, it returns false, if there is more input-data to be loaded.
|
||
|
* A "RuntimeException" is thrown if no input-urls were retrieved in general.
|
||
|
* @param isEmptyOfData
|
||
|
* @param isFirstRun
|
||
|
* @return finished loading / not finished
|
||
|
* @throws RuntimeException
|
||
|
*/
|
||
|
public static boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun)
|
||
|
{
|
||
|
if ( isEmptyOfData ) {
|
||
|
if ( isFirstRun )
|
||
|
logger.error("Could not retrieve any urls from the inputFile!");
|
||
|
else
|
||
|
logger.debug("Done loading " + FileUtils.getCurrentlyLoadedUrls() + " urls from the inputFile.");
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
}
|