// File: UrlsController/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java
package eu.openaire.urls_controller.util;
import com.google.common.collect.HashMultimap;
import eu.openaire.urls_controller.models.Task;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.configurationprocessor.json.JSONException;
import org.springframework.boot.configurationprocessor.json.JSONObject;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
import java.util.Scanner;
@Component
public class TestFileUtils {

    private static final Logger logger = LoggerFactory.getLogger(TestFileUtils.class);

    /** The classpath json-file providing the test input (one {"id":..,"url":..} object per line). */
    public Resource testResource = new ClassPathResource("testInputFiles/orderedList1000.json");

    /** Per-thread count of duplicate (id, url) pairs encountered while loading the input. */
    public ThreadLocal<Integer> duplicateIdUrlEntries;

    /** On each request, a new thread is spawned to handle it. So, a new inputScanner is needed. */
    public ThreadLocal<Scanner> inputScanner;

    private final int jsonBatchSize = 3000;

    /** Per-thread count of input-lines read so far (includes empty/problematic lines). */
    private ThreadLocal<Integer> fileIndex;

    /** Per-thread count of empty or non-parsable input-lines. */
    private ThreadLocal<Integer> unretrievableInputLines;

    private final String utf8Charset = "UTF-8";


    /**
     * Initializes the per-thread state.
     * Each thread must open its OWN InputStream: sharing a single stream between thread-local
     * Scanners would make the threads advance each other's read-position in the file.
     * A RuntimeException is thrown (from the initializer) if the resource cannot be opened.
     */
    public TestFileUtils() throws IOException {
        inputScanner = ThreadLocal.withInitial(() -> {
            try {
                return new Scanner(testResource.getInputStream(), utf8Charset);
            } catch (IOException ioe) {
                throw new RuntimeException("Could not open an InputStream for the testResource: " + testResource, ioe);
            }
        });
        fileIndex = ThreadLocal.withInitial(() -> 0);
        unretrievableInputLines = ThreadLocal.withInitial(() -> 0);
        duplicateIdUrlEntries = ThreadLocal.withInitial(() -> 0);
    }

    /**
     * This method parses a Json file and extracts the urls, along with the IDs.
     * It reads up to "jsonBatchSize" lines per call, resuming from the current thread's position.
     * @return HashMultimap&lt;String, String&gt; the id-to-url mappings loaded in this batch
     */
    public HashMultimap<String, String> getNextIdUrlPairBatchFromJson() {
        Task inputIdUrlTuple;
        int expectedPathsPerID = 5;
        int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID;
        HashMultimap<String, String> idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID);
        int curBeginning = fileIndex.get();

        while ( inputScanner.get().hasNextLine() && (fileIndex.get() < (curBeginning + jsonBatchSize)) )
        {   // While (!EOF) and inside the current url-batch, iterate through lines.
            String retrievedLineStr = inputScanner.get().nextLine();
            fileIndex.set(fileIndex.get() +1);

            if ( retrievedLineStr.isEmpty() ) {
                unretrievableInputLines.set(unretrievableInputLines.get() +1);
                continue;
            }
            if ( (inputIdUrlTuple = jsonDecoder(retrievedLineStr)) == null ) { // Decode the jsonLine and take the two attributes.
                logger.warn("A problematic inputLine found: \t{}", retrievedLineStr);
                unretrievableInputLines.set(unretrievableInputLines.get() +1);
                continue;
            }
            if ( !idAndUrlMappedInput.put(inputIdUrlTuple.getId(), inputIdUrlTuple.getUrl()) ) { // We have a duplicate url in the input.. log it here as we cannot pass it through the HashMultimap. It's possible that this as well as the original might be/give a docUrl.
                duplicateIdUrlEntries.set(duplicateIdUrlEntries.get() +1);
            }
        }
        return idAndUrlMappedInput;
    }

    /**
     * This method decodes a Json String and returns its members wrapped in a Task.
     * @param jsonLine String a single json-object line containing "id" and "url" attributes
     * @return Task holding the (id, url) pair, or null if the line could not be parsed or the url is missing
     */
    private Task jsonDecoder(String jsonLine) {
        String idStr = null;
        String urlStr = null;
        try {
            JSONObject jObj = new JSONObject(jsonLine); // Construct a JSONObject from the retrieved jsonLine.
            idStr = jObj.get("id").toString();
            urlStr = jObj.get("url").toString();
        } catch (JSONException je) {
            logger.warn("JSONException caught when tried to parse and extract values from jsonLine: \t{}", jsonLine, je);
            return null;
        }

        if ( urlStr.isEmpty() ) {
            if ( !idStr.isEmpty() ) // If we only have the id, then go and log it.
                logger.warn("The url was not found for id: \"{}\"", idStr);
            return null;
        }
        return new Task(idStr, urlStr, null);
    }

    /**
     * This method checks if there is no more input-data and returns true in that case.
     * Otherwise, it returns false, if there is more input-data to be loaded.
     * @param isEmptyOfData whether the last attempted batch-load produced no data
     * @param isFirstRun whether this was the very first batch-load attempt
     * @return finished loading / not finished
     */
    public boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun) {
        if ( isEmptyOfData ) {
            if ( isFirstRun )
                logger.error("Could not retrieve any urls from the inputFile!");
            else
                logger.debug("Done loading {} urls from the inputFile.", getCurrentlyLoadedUrls());
            return true;
        }
        return false;
    }

    /**
     * This method returns the number of (non-heading, non-empty) lines we have read from the inputFile.
     * @return loadedUrls the total number of urls processed so far by the current thread
     */
    private int getCurrentlyLoadedUrls() { // In the end, it gives the total number of urls we have processed.
        return fileIndex.get() - unretrievableInputLines.get();
    }

}