// 2022-01-30 21:14:52 +01:00
package eu.openaire.urls_controller.util ;
import com.google.common.collect.HashMultimap ;
import eu.openaire.urls_controller.models.Task ;
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
import org.springframework.boot.configurationprocessor.json.JSONException ;
import org.springframework.boot.configurationprocessor.json.JSONObject ;
// 2022-02-01 15:57:28 +01:00
import org.springframework.core.io.ClassPathResource ;
// 2022-02-02 19:19:46 +01:00
import org.springframework.core.io.Resource ;
// 2022-01-30 21:14:52 +01:00
import org.springframework.stereotype.Component ;
import java.io.IOException ;
import java.io.InputStream ;
import java.util.Scanner ;
@Component
public class TestFileUtils {
private static final Logger logger = LoggerFactory . getLogger ( TestFileUtils . class ) ;
2022-02-02 19:19:46 +01:00
public Resource testResource = new ClassPathResource ( " testInputFiles/orderedList1000.json " ) ;
2022-01-30 21:14:52 +01:00
2022-02-02 19:19:46 +01:00
public ThreadLocal < Integer > duplicateIdUrlEntries ;
2022-05-26 14:43:59 +02:00
public ThreadLocal < Scanner > inputScanner ; // On each request, a new thread is spawned to handle it. So, a new inputScanner is needed.
2022-01-30 21:14:52 +01:00
private final int jsonBatchSize = 3000 ;
2022-02-02 19:19:46 +01:00
private ThreadLocal < Integer > fileIndex ;
private ThreadLocal < Integer > unretrievableInputLines ;
2022-01-30 21:14:52 +01:00
private final String utf8Charset = " UTF-8 " ;
2022-02-02 19:19:46 +01:00
public TestFileUtils ( ) throws IOException {
2022-01-30 21:14:52 +01:00
InputStream inputStream = testResource . getInputStream ( ) ;
2022-02-02 19:19:46 +01:00
inputScanner = ThreadLocal . withInitial ( ( ) - > new Scanner ( inputStream , utf8Charset ) ) ;
fileIndex = ThreadLocal . withInitial ( ( ) - > 0 ) ;
unretrievableInputLines = ThreadLocal . withInitial ( ( ) - > 0 ) ;
duplicateIdUrlEntries = ThreadLocal . withInitial ( ( ) - > 0 ) ;
2022-01-30 21:14:52 +01:00
}
2022-02-02 19:19:46 +01:00
2022-01-30 21:14:52 +01:00
/ * *
* This method parses a Json file and extracts the urls , along with the IDs .
* @return HashMultimap < String , String >
* /
public HashMultimap < String , String > getNextIdUrlPairBatchFromJson ( ) {
Task inputIdUrlTuple ;
int expectedPathsPerID = 5 ;
int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID ;
HashMultimap < String , String > idAndUrlMappedInput = HashMultimap . create ( expectedIDsPerBatch , expectedPathsPerID ) ;
int curBeginning = fileIndex . get ( ) ;
while ( inputScanner . get ( ) . hasNextLine ( ) & & ( fileIndex . get ( ) < ( curBeginning + jsonBatchSize ) ) )
{ // While (!EOF) and inside the current url-batch, iterate through lines.
//logger.debug("fileIndex: " + FileUtils.fileIndex.get()); // DEBUG!
// Take each line, remove potential double quotes.
String retrievedLineStr = inputScanner . get ( ) . nextLine ( ) ;
//logger.debug("Loaded from inputFile: " + retrievedLineStr); // DEBUG!
fileIndex . set ( fileIndex . get ( ) + 1 ) ;
if ( retrievedLineStr . isEmpty ( ) ) {
unretrievableInputLines . set ( unretrievableInputLines . get ( ) + 1 ) ;
continue ;
}
if ( ( inputIdUrlTuple = jsonDecoder ( retrievedLineStr ) ) = = null ) { // Decode the jsonLine and take the two attributes.
logger . warn ( " A problematic inputLine found: \ t " + retrievedLineStr ) ;
unretrievableInputLines . set ( unretrievableInputLines . get ( ) + 1 ) ;
continue ;
}
if ( ! idAndUrlMappedInput . put ( inputIdUrlTuple . getId ( ) , inputIdUrlTuple . getUrl ( ) ) ) { // We have a duplicate url in the input.. log it here as we cannot pass it through the HashMultimap. It's possible that this as well as the original might be/give a docUrl.
duplicateIdUrlEntries . set ( duplicateIdUrlEntries . get ( ) + 1 ) ;
}
}
return idAndUrlMappedInput ;
}
2022-02-02 19:19:46 +01:00
2022-01-30 21:14:52 +01:00
/ * *
* This method decodes a Json String and returns its members .
* @param jsonLine String
* @return HashMap < String , String >
* /
private Task jsonDecoder ( String jsonLine ) {
// Get ID and url and put them in the HashMap
String idStr = null ;
String urlStr = null ;
try {
JSONObject jObj = new JSONObject ( jsonLine ) ; // Construct a JSONObject from the retrieved jsonLine.
idStr = jObj . get ( " id " ) . toString ( ) ;
urlStr = jObj . get ( " url " ) . toString ( ) ;
} catch ( JSONException je ) {
logger . warn ( " JSONException caught when tried to parse and extract values from jsonLine: \ t " + jsonLine , je ) ;
return null ;
}
if ( urlStr . isEmpty ( ) ) {
if ( ! idStr . isEmpty ( ) ) // If we only have the id, then go and log it.
logger . warn ( " The url was not found for id: \" " + idStr + " \" " ) ;
return null ;
}
return new Task ( idStr , urlStr , null ) ;
}
2022-02-02 19:19:46 +01:00
2022-01-30 21:14:52 +01:00
/ * *
* This method checks if there is no more input - data and returns true in that case .
* Otherwise , it returns false , if there is more input - data to be loaded .
* A " RuntimeException " is thrown if no input - urls were retrieved in general .
* @param isEmptyOfData
* @param isFirstRun
* @return finished loading / not finished
* /
public boolean isFinishedLoading ( boolean isEmptyOfData , boolean isFirstRun ) {
if ( isEmptyOfData ) {
if ( isFirstRun )
logger . error ( " Could not retrieve any urls from the inputFile! " ) ;
else
logger . debug ( " Done loading " + getCurrentlyLoadedUrls ( ) + " urls from the inputFile. " ) ;
return true ;
}
return false ;
}
2022-02-02 19:19:46 +01:00
2022-01-30 21:14:52 +01:00
/ * *
* This method returns the number of ( non - heading , non - empty ) lines we have read from the inputFile .
* @return loadedUrls
* /
private int getCurrentlyLoadedUrls ( ) { // In the end, it gives the total number of urls we have processed.
return fileIndex . get ( ) - unretrievableInputLines . get ( ) ;
}
2022-02-02 19:19:46 +01:00
2022-01-30 21:14:52 +01:00
}