package eu.openaire.urls_controller.util;

import com.google.common.collect.HashMultimap;
import eu.openaire.urls_controller.models.Task;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.configurationprocessor.json.JSONException;
import org.springframework.boot.configurationprocessor.json.JSONObject;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.io.InputStream;
import java.util.Scanner;

/**
 * Loads batches of id-url pairs from the bundled test input file
 * ({@code testInputFiles/orderedList1000.json}), one JSON object per line.
 *
 * <p>The reading position and the statistics counters are kept in {@link ThreadLocal}s, so every
 * thread that calls into this component tracks its own progress.
 */
@Component
public class TestFileUtils {

    private static final Logger logger = LoggerFactory.getLogger(TestFileUtils.class);

    /** Maximum number of input lines consumed by a single batch. */
    private static final int JSON_BATCH_SIZE = 3000;

    /** Charset name used when decoding the input file. */
    private static final String UTF8_CHARSET = "UTF-8";

    /** The classpath resource the id-url pairs are read from. */
    public Resource testResource = new ClassPathResource("testInputFiles/orderedList1000.json");

    /** Per-thread count of id-url pairs which were rejected as duplicates inside a batch. */
    public ThreadLocal<Integer> duplicateIdUrlEntries;

    /** Per-thread scanner over the input file. On each request, a new thread is spawned to handle it. So, a new inputScanner is needed. */
    public ThreadLocal<Scanner> inputScanner;

    /** Per-thread count of the lines read so far (including empty and problematic ones). */
    private final ThreadLocal<Integer> fileIndex;

    /** Per-thread count of the lines which were empty or could not be decoded. */
    private final ThreadLocal<Integer> unretrievableInputLines;

    /**
     * Opens the test resource and prepares the per-thread scanner and counters.
     *
     * @throws IOException if the test resource cannot be opened
     */
    public TestFileUtils() throws IOException {
        // NOTE(review): the stream is opened once and captured by the supplier, so every per-thread
        // Scanner wraps this SAME stream and the threads consume it together. Confirm this is intended;
        // if each thread should read the whole file, the stream has to be opened inside the supplier.
        InputStream inputStream = testResource.getInputStream();
        inputScanner = ThreadLocal.withInitial(() -> new Scanner(inputStream, UTF8_CHARSET));
        fileIndex = ThreadLocal.withInitial(() -> 0);
        unretrievableInputLines = ThreadLocal.withInitial(() -> 0);
        duplicateIdUrlEntries = ThreadLocal.withInitial(() -> 0);
    }

    /**
     * Parses the next batch of lines of the Json input file and extracts the urls, along with their IDs.
     * At most {@code JSON_BATCH_SIZE} lines are consumed per call. Empty and undecodable lines are
     * counted as unretrievable and skipped; duplicate id-url pairs are counted and dropped.
     *
     * @return the id-url pairs of this batch; empty when there is no more input for the calling thread
     */
    public HashMultimap<String, String> getNextIdUrlPairBatchFromJson() {
        int expectedPathsPerID = 5;
        int expectedIDsPerBatch = JSON_BATCH_SIZE / expectedPathsPerID;
        HashMultimap<String, String> idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID);

        Scanner scanner = inputScanner.get();
        int batchEnd = fileIndex.get() + JSON_BATCH_SIZE;

        // While (!EOF) and inside the current url-batch, iterate through lines.
        while (scanner.hasNextLine() && (fileIndex.get() < batchEnd)) {
            String retrievedLineStr = scanner.nextLine();
            increment(fileIndex);

            if (retrievedLineStr.isEmpty()) {
                increment(unretrievableInputLines);
                continue;
            }

            // Decode the jsonLine and take the two attributes.
            Task inputIdUrlTuple = jsonDecoder(retrievedLineStr);
            if (inputIdUrlTuple == null) {
                logger.warn("A problematic inputLine found: \t{}", retrievedLineStr);
                increment(unretrievableInputLines);
                continue;
            }

            // "put" returns false when this exact id-url pair is already stored: we have a duplicate in the input.
            // Count it here, as it cannot pass through the HashMultimap.
            if (!idAndUrlMappedInput.put(inputIdUrlTuple.getId(), inputIdUrlTuple.getUrl())) {
                increment(duplicateIdUrlEntries);
            }
        }

        return idAndUrlMappedInput;
    }

    /**
     * Decodes a Json line and wraps its "id" and "url" members in a {@link Task}.
     *
     * @param jsonLine the Json text of a single input line
     * @return the decoded task, or {@code null} if the line cannot be parsed, a member is missing,
     *         or the url is empty
     */
    private Task jsonDecoder(String jsonLine) {
        String idStr;
        String urlStr;
        try {
            JSONObject jObj = new JSONObject(jsonLine); // Construct a JSONObject from the retrieved jsonLine.
            idStr = jObj.get("id").toString();
            urlStr = jObj.get("url").toString();
        } catch (JSONException je) {
            // The exception is passed as the last argument, so it is logged with its stack trace.
            logger.warn("JSONException caught when tried to parse and extract values from jsonLine: \t{}", jsonLine, je);
            return null;
        }

        if (urlStr.isEmpty()) {
            if (!idStr.isEmpty()) { // If we only have the id, then go and log it.
                logger.warn("The url was not found for id: \"{}\"", idStr);
            }
            return null;
        }

        return new Task(idStr, urlStr, null);
    }

    /**
     * Checks if there is no more input-data and returns true in that case.
     * Otherwise, it returns false, as there may be more input-data to be loaded.
     * When the data ran out, an error is logged if this was the first run (nothing was retrieved at all),
     * otherwise the number of the loaded urls is logged at debug level. No exception is thrown.
     *
     * @param isEmptyOfData whether the last retrieved batch was empty
     * @param isFirstRun whether this is the first attempt to load data
     * @return {@code true} if loading has finished, {@code false} otherwise
     */
    public boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun) {
        if (!isEmptyOfData) {
            return false;
        }

        if (isFirstRun) {
            logger.error("Could not retrieve any urls from the inputFile!");
        } else {
            logger.debug("Done loading {} urls from the inputFile.", getCurrentlyLoadedUrls());
        }
        return true;
    }

    /**
     * Returns the number of lines read by the calling thread, minus the ones which were empty or undecodable.
     * In the end, it gives the total number of urls we have processed.
     *
     * @return the number of loaded urls
     */
    private int getCurrentlyLoadedUrls() {
        return fileIndex.get() - unretrievableInputLines.get();
    }

    /** Adds one to the calling thread's value of the given counter. */
    private static void increment(ThreadLocal<Integer> counter) {
        counter.set(counter.get() + 1);
    }
}