2021-05-18 16:23:20 +02:00
package eu.openaire.urls_controller.util ;
import com.google.common.collect.HashMultimap ;
import eu.openaire.urls_controller.models.Task ;
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
import org.springframework.boot.configurationprocessor.json.JSONException ;
import org.springframework.boot.configurationprocessor.json.JSONObject ;
import java.io.File ;
import java.io.IOException ;
import java.io.InputStream ;
import java.io.UncheckedIOException ;
import java.nio.file.Files ;
import java.nio.file.Paths ;
import java.util.Scanner ;
import java.util.stream.Stream ;
public class FileUtils {
private static final Logger logger = LoggerFactory . getLogger ( FileUtils . class ) ;
2021-06-10 19:24:51 +02:00
public static ThreadLocal < Scanner > inputScanner = new ThreadLocal < Scanner > ( ) ; // Every Thread has its own variable.
private static ThreadLocal < Integer > fileIndex = new ThreadLocal < Integer > ( ) ;
private static ThreadLocal < Integer > unretrievableInputLines = new ThreadLocal < Integer > ( ) ;
public static ThreadLocal < Integer > duplicateIdUrlEntries = new ThreadLocal < Integer > ( ) ;
public static final int jsonBatchSize = 3000 ;
2021-05-18 16:23:20 +02:00
private static final String utf8Charset = " UTF-8 " ;
public static String inputFileFullPath ;
2021-06-10 19:24:51 +02:00
private static final String workingDir = System . getProperty ( " user.dir " ) + File . separator ;
2021-05-18 16:23:20 +02:00
2021-06-10 13:21:39 +02:00
public FileUtils ( ) throws RuntimeException
2021-05-18 16:23:20 +02:00
{
2021-06-10 13:21:39 +02:00
inputFileFullPath = workingDir + " src " + File . separator + " main " + File . separator + " resources " ;
2021-05-18 16:23:20 +02:00
String resourceFileName = " testInputFiles " + File . separator + " orderedList1000.json " ;
inputFileFullPath + = File . separator + resourceFileName ;
InputStream inputStream = getClass ( ) . getClassLoader ( ) . getResourceAsStream ( resourceFileName ) ;
2021-06-10 13:21:39 +02:00
if ( inputStream = = null )
throw new RuntimeException ( " No resourceFile was found with name \" " + resourceFileName + " \" . " ) ;
2021-05-18 16:23:20 +02:00
logger . debug ( " Going to retrieve the data from the inputResourceFile: " + resourceFileName ) ;
2021-06-10 19:24:51 +02:00
FileUtils . inputScanner . set ( new Scanner ( inputStream , utf8Charset ) ) ;
fileIndex . set ( 0 ) ; // Re-initialize the file-number-pointer.
unretrievableInputLines . set ( 0 ) ;
duplicateIdUrlEntries . set ( 0 ) ;
2021-05-18 16:23:20 +02:00
}
// This is currently not used, but it may be useful in a future scenario.
private static long getInputFileLinesNum ( )
{
long numOfLines = 0 ;
try {
numOfLines = Files . lines ( Paths . get ( inputFileFullPath ) ) . count ( ) ;
logger . debug ( " The numOfLines in the inputFile is " + numOfLines ) ;
} catch ( IOException e ) {
logger . error ( " Could not retrieve the numOfLines. " + e ) ;
return - 1 ;
}
return numOfLines ;
}
/ * *
* This method decodes a Jason String into its members .
* @param jsonLine String
* @return HashMap < String , String >
* /
public static Task jsonDecoder ( String jsonLine )
{
// Get ID and url and put them in the HashMap
String idStr = null ;
String urlStr = null ;
try {
JSONObject jObj = new JSONObject ( jsonLine ) ; // Construct a JSONObject from the retrieved jsonLine.
idStr = jObj . get ( " id " ) . toString ( ) ;
urlStr = jObj . get ( " url " ) . toString ( ) ;
} catch ( JSONException je ) {
logger . warn ( " JSONException caught when tried to parse and extract values from jsonLine: \ t " + jsonLine , je ) ;
return null ;
}
if ( urlStr . isEmpty ( ) ) {
if ( ! idStr . isEmpty ( ) ) // If we only have the id, then go and log it.
logger . warn ( " The url was not found for id: \" " + idStr + " \" " ) ;
return null ;
}
2021-05-20 01:50:50 +02:00
return new Task ( idStr , urlStr , null ) ;
2021-05-18 16:23:20 +02:00
}
/ * *
* This method parses a Json file and extracts the urls , along with the IDs .
* @return HashMultimap < String , String >
* /
public static HashMultimap < String , String > getNextIdUrlPairBatchFromJson ( )
{
Task inputIdUrlTuple ;
int expectedPathsPerID = 5 ;
int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID ;
HashMultimap < String , String > idAndUrlMappedInput = HashMultimap . create ( expectedIDsPerBatch , expectedPathsPerID ) ;
2021-06-10 19:24:51 +02:00
int curBeginning = fileIndex . get ( ) ;
2021-05-18 16:23:20 +02:00
2021-06-10 19:24:51 +02:00
while ( inputScanner . get ( ) . hasNextLine ( ) & & ( fileIndex . get ( ) < ( curBeginning + jsonBatchSize ) ) )
2021-05-18 16:23:20 +02:00
{ // While (!EOF) and inside the current url-batch, iterate through lines.
2021-06-10 19:24:51 +02:00
//logger.debug("fileIndex: " + FileUtils.fileIndex.get()); // DEBUG!
2021-05-18 16:23:20 +02:00
// Take each line, remove potential double quotes.
2021-06-10 19:24:51 +02:00
String retrievedLineStr = inputScanner . get ( ) . nextLine ( ) ;
2021-05-18 16:23:20 +02:00
//logger.debug("Loaded from inputFile: " + retrievedLineStr); // DEBUG!
2021-06-10 19:24:51 +02:00
fileIndex . set ( fileIndex . get ( ) + 1 ) ;
2021-05-18 16:23:20 +02:00
if ( retrievedLineStr . isEmpty ( ) ) {
2021-06-10 19:24:51 +02:00
unretrievableInputLines . set ( unretrievableInputLines . get ( ) + 1 ) ;
2021-05-18 16:23:20 +02:00
continue ;
}
if ( ( inputIdUrlTuple = jsonDecoder ( retrievedLineStr ) ) = = null ) { // Decode the jsonLine and take the two attributes.
logger . warn ( " A problematic inputLine found: \ t " + retrievedLineStr ) ;
2021-06-10 19:24:51 +02:00
unretrievableInputLines . set ( unretrievableInputLines . get ( ) + 1 ) ;
2021-05-18 16:23:20 +02:00
continue ;
}
if ( ! idAndUrlMappedInput . put ( inputIdUrlTuple . getId ( ) , inputIdUrlTuple . getUrl ( ) ) ) { // We have a duplicate url in the input.. log it here as we cannot pass it through the HashMultimap. It's possible that this as well as the original might be/give a docUrl.
2021-06-10 19:24:51 +02:00
duplicateIdUrlEntries . set ( duplicateIdUrlEntries . get ( ) + 1 ) ;
2021-05-18 16:23:20 +02:00
}
}
return idAndUrlMappedInput ;
}
/ * *
* This method returns the number of ( non - heading , non - empty ) lines we have read from the inputFile .
* @return loadedUrls
* /
public static int getCurrentlyLoadedUrls ( ) // In the end, it gives the total number of urls we have processed.
{
2021-06-10 19:24:51 +02:00
return FileUtils . fileIndex . get ( ) - FileUtils . unretrievableInputLines . get ( ) ;
2021-05-18 16:23:20 +02:00
}
/ * *
* This method checks if there is no more input - data and returns true in that case .
* Otherwise , it returns false , if there is more input - data to be loaded .
* A " RuntimeException " is thrown if no input - urls were retrieved in general .
* @param isEmptyOfData
* @param isFirstRun
* @return finished loading / not finished
* @throws RuntimeException
* /
public static boolean isFinishedLoading ( boolean isEmptyOfData , boolean isFirstRun )
{
if ( isEmptyOfData ) {
if ( isFirstRun )
logger . error ( " Could not retrieve any urls from the inputFile! " ) ;
else
logger . debug ( " Done loading " + FileUtils . getCurrentlyLoadedUrls ( ) + " urls from the inputFile. " ) ;
return true ;
}
return false ;
}
}