package eu.openaire.urls_controller.util;

import com.google.common.collect.HashMultimap;
import eu.openaire.urls_controller.configuration.ImpalaConnector;
import eu.openaire.urls_controller.models.Payload;
import eu.openaire.urls_controller.models.Task;
import eu.openaire.urls_controller.models.UrlReport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.configurationprocessor.json.JSONException;
import org.springframework.boot.configurationprocessor.json.JSONObject;

import javax.servlet.http.HttpServletRequest;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class FileUtils {

	private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);

	public static ThreadLocal<Scanner> inputScanner = new ThreadLocal<>();	// Every Thread has its own variable.
	private static final ThreadLocal<Integer> fileIndex = new ThreadLocal<>();
	private static final ThreadLocal<Integer> unretrievableInputLines = new ThreadLocal<>();
	public static ThreadLocal<Integer> duplicateIdUrlEntries = new ThreadLocal<>();

	public static final int jsonBatchSize = 3000;
	private static final String utf8Charset = "UTF-8";
	public static String inputFileFullPath;
	private static final String workingDir = System.getProperty("user.dir") + File.separator;


	public FileUtils() throws RuntimeException {
		inputFileFullPath = workingDir + "src" + File.separator + "main" + File.separator + "resources";
		String resourceFileName = "testInputFiles" + File.separator + "orderedList1000.json";
		inputFileFullPath += File.separator + resourceFileName;
		InputStream inputStream = getClass().getClassLoader().getResourceAsStream(resourceFileName);
		if ( inputStream == null )
			throw new RuntimeException("No resourceFile was found with name \"" + resourceFileName + "\".");

		logger.debug("Going to retrieve the data from the inputResourceFile: " + resourceFileName);

		FileUtils.inputScanner.set(new Scanner(inputStream, utf8Charset));
		fileIndex.set(0);	// Re-initialize the file-number-pointer.
		unretrievableInputLines.set(0);
		duplicateIdUrlEntries.set(0);
	}


	/**
	 * In each insertion, a new parquet-file is created, so we end up with millions of files. Parquet is great for fast selects, so we have to stick with it and merge those files.
	 * This method creates a clone of the original table, in order to have only one parquet file in the end. It drops the original table
	 * and renames the clone to the original's name.
	 * It returns the errorMsg if an error appears, otherwise it returns "null".
	 */
	public static String mergeParquetFiles(String tableName, Connection con, String whereClause, String parameter) {
		String errorMsg;
		if ( tableName == null ) {
			errorMsg = "No tableName was given. We do not know the table for which we should merge the underlying files!";
			logger.error(errorMsg);
			return errorMsg;
		}

		// Make sure the following are empty strings (in case another method calls this one in the future with a null-value).
		if ( whereClause == null )
			whereClause = "";

		if ( parameter == null )
			parameter = "";
		else
			parameter = " '" + parameter + "'";	// This will be a "string-check".
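		// The merging is done with a "clone-drop-rename" sequence of Impala queries:
		//   1) CREATE a "<tableName>_tmp" table, stored as parquet, from a SELECT on the original table (optionally filtered by the given whereClause and parameter).
		//   2) DROP the original table (PURGE removes its many small data-files immediately, instead of moving them to the trash).
		//   3) ALTER (rename) the "_tmp" clone back to the original table-name.
		//   4) COMPUTE STATS on the resulting table, so the query-planner has fresh statistics.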
		Statement statement;
		try {
			statement = con.createStatement();
		} catch (SQLException sqle) {
			errorMsg = "Problem when creating a connection-statement!\n";
			logger.error(errorMsg + sqle.getMessage());
			return errorMsg;
		}

		try {
			statement.execute("CREATE TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + ImpalaConnector.databaseName + "." + tableName + " " + whereClause + parameter);
			statement.execute("DROP TABLE " + ImpalaConnector.databaseName + "." + tableName + " PURGE");
			statement.execute("ALTER TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp RENAME TO " + ImpalaConnector.databaseName + "." + tableName);
			statement.execute("COMPUTE STATS " + ImpalaConnector.databaseName + "." + tableName);
		} catch (SQLException sqle) {
			errorMsg = "Problem when executing the \"clone-drop-rename\" queries!\n";
			logger.error(errorMsg + getCutBatchExceptionMessage(sqle.getMessage()), sqle);
			return errorMsg;
		} finally {
			// Make sure we close the statement.
			try {
				statement.close();
			} catch (SQLException sqle3) {
				logger.error("Could not close the statement for executing queries in the Impala-database.\n" + sqle3);
			}
		}

		return null;	// No errorMsg, everything is fine.
	}


	public enum UploadFullTextsResponse { successful, unsuccessful, databaseError }

	private static final Pattern FILENAME_ID = Pattern.compile("([\\w_:]+)\\.[\\w]{2,10}$");
	private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:]+\\.[\\w]{2,10})$");

	public static final String baseTargetLocation = System.getProperty("user.dir") + File.separator + "fullTexts" + File.separator;

	private static final int numOfFullTextsPerBatch = 70;	// The HTTP-headers cannot be too large (it failed with 100 fileNames).


	public static UploadFullTextsResponse getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) {
		// The Controller has to request the files from the Worker, in order to upload them to the S3.
		// We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database.

		if ( request == null ) {
			logger.error("The \"HttpServletRequest\" is null!");
			return UploadFullTextsResponse.unsuccessful;
		}

		String remoteAddr = request.getHeader("X-FORWARDED-FOR");
		if ( remoteAddr == null || "".equals(remoteAddr) )
			remoteAddr = request.getRemoteAddr();

		ImpalaConnector.databaseLock.lock();

		Connection con = ImpalaConnector.getInstance().getConnection();
		if ( con == null ) {
			ImpalaConnector.databaseLock.unlock();
			logger.error("Problem when creating the Impala-connection!");
			return UploadFullTextsResponse.databaseError;
		}

		String getFileLocationForHashQuery = "select `location` from " + ImpalaConnector.databaseName + ".payload where `hash` = ?";
		PreparedStatement getFileLocationForHashPreparedStatement = null;
		try {
			getFileLocationForHashPreparedStatement = con.prepareStatement(getFileLocationForHashQuery);
		} catch (SQLException sqle) {
			ImpalaConnector.databaseLock.unlock();
			logger.error("Problem when creating the prepared statement for \"" + getFileLocationForHashQuery + "\"!\n" + sqle.getMessage());
			return UploadFullTextsResponse.databaseError;
		}

		// Get the file-locations.
		int numFullTextUrlsFound = 0;
		int numFilesFoundFromPreviousAssignmentsBatches = 0;
		HashMultimap<String, String> allFileNamesWithIDsHashMap = HashMultimap.create((urlReports.size() / 5), 3);	// Holds multiple values for any key, since a fileName(key) may have many IDs(values) associated with it.
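		// For each successfully-processed record: first check (by file-hash) whether its full-text was already uploaded by a previous assignments-batch.
		// If it was, just point the payload to the existing S3 location; otherwise, collect its fileName, so the file can be requested from the Worker further down.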
		for ( UrlReport urlReport : urlReports ) {
			UrlReport.StatusType statusType = urlReport.getStatus();
			if ( (statusType == null) || statusType.equals(UrlReport.StatusType.non_accessible) )
				continue;

			numFullTextUrlsFound++;

			Payload payload = urlReport.getPayload();
			if ( payload != null ) {
				String fileLocation = null;

				// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
				// If no result is returned, then this record was not previously found, so go ahead and add it in the list of files to request from the Worker.
				// If a file-location IS returned (for this hash), then this file has already been uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
				// Use the same prepared-statement for all requests, to improve speed (just like when inserting similar things to the DB).
				String fileHash = payload.getHash();
				if ( fileHash != null ) {
					try {
						getFileLocationForHashPreparedStatement.setString(1, fileHash);
					} catch (SQLException sqle) {
						logger.error("Error when setting the parameter in \"getFileLocationForHashQuery\"!\n" + sqle.getMessage());
					}

					try ( ResultSet resultSet = getFileLocationForHashPreparedStatement.executeQuery() ) {
						if ( resultSet.next() ) {	// Move the "cursor" to the first row, if there is any data.
							fileLocation = resultSet.getString(1);
							if ( fileLocation != null ) {	// If the full-text of this record was already found and uploaded..
								payload.setLocation(fileLocation);	// Set the location to the older, identical file, which was uploaded to S3.
								//logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + fileLocation + "\".");	// DEBUG!
								numFilesFoundFromPreviousAssignmentsBatches++;
								continue;
							}
						}
					} catch (Exception e) {
						logger.error("Error when executing or acquiring data from the \"getFileLocationForHashQuery\"!\n" + e.getMessage());
						// TODO - SHOULD WE RETURN A "UploadFullTextsResponse.databaseError" AND force the caller to not even insert the payloads to the database?
						// TODO - Since the database will have problems, there is no point in trying to insert the payloads to Impala (handling it like we tried to insert and got an error).
						// TODO - In case we DO return, UNLOCK the database-lock and close the Prepared statement (it's not auto-closed here) and the Database connection.
					}
				}

				// If the full-text of this record was not found by a previous batch..
				fileLocation = payload.getLocation();
				if ( fileLocation != null ) {	// If the docFile was downloaded (without an error)..
					Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation);
					if ( !matcher.matches() )
						continue;

					String fileNameWithExtension = matcher.group(1);
					if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() )
						continue;

					allFileNamesWithIDsHashMap.put(fileNameWithExtension, payload.getId());	// The keys and the values are not duplicate. A task with ID-1 might have an "ID-1.pdf" file,
					// while a task with ID-2 can also have an "ID-1.pdf" file, as pdf-url-2 might be the same as pdf-url-1.
				}
			}
		}

		// Close the Prepared Statement.
		try {
			if ( getFileLocationForHashPreparedStatement != null )
				getFileLocationForHashPreparedStatement.close();
		} catch (SQLException sqle) {
			logger.error("Failed to close the \"getFileLocationForHashPreparedStatement\"!\n" + sqle.getMessage());
		} finally {
			ImpalaConnector.databaseLock.unlock();	// The rest of this method's work does not use the database.
			ImpalaConnector.closeConnection(con);
		}

		logger.info("NumFullTextUrlsFound by assignments_" + assignmentsBatchCounter + " = " + numFullTextUrlsFound + " (out of " + urlReports.size() + ").");
		logger.debug("NumFilesFoundFromPreviousAssignmentsBatches = " + numFilesFoundFromPreviousAssignmentsBatches);

		ArrayList<String> allFileNames = new ArrayList<>(allFileNamesWithIDsHashMap.keySet());
		int numAllFullTexts = allFileNames.size();
		if ( numAllFullTexts == 0 ) {
			logger.warn("The retrieved files were < 0 > for assignments_" + assignmentsBatchCounter + " | from worker: " + workerId);
			return UploadFullTextsResponse.successful;	// It was handled, no error.
		}

		// Request the full-texts in batches, compressed in a zip-file.
		int numOfBatches = (numAllFullTexts / numOfFullTextsPerBatch);
		if ( (numAllFullTexts % numOfFullTextsPerBatch) > 0 )	// Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
			numOfBatches++;

		logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numAllFullTexts + " distinct non-already-uploaded fullTexts. Going to request them from the Worker \"" + workerId + "\", in " + numOfBatches + " batches.");

		// Any full-texts left over by the above division are covered by the extra (last) batch.
		String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTexts/" + assignmentsBatchCounter + "/" + numOfBatches + "/";

		// Index all Payloads, to be searched more efficiently later.
		HashMultimap<String, Payload> payloadsHashMultimap = HashMultimap.create((urlReports.size() / 3), 3);	// Holds multiple values for any key, since an ID(key) may have multiple payloads(values) associated with it.
		for ( UrlReport urlReport : urlReports ) {
			Payload payload = urlReport.getPayload();
			if ( payload != null )
				payloadsHashMultimap.put(payload.getId(), payload);
		}

		String curAssignmentsBaseLocation = baseTargetLocation + "assignments_" + assignmentsBatchCounter + File.separator;
		File curAssignmentsBaseDir = new File(curAssignmentsBaseLocation);

		int failedBatches = 0;
		for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter ) {
			List<String> fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, batchCounter);
			HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, batchCounter, fileNamesForCurBatch, numOfBatches, workerId);
			if ( conn == null ) {
				updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMultimap, fileNamesForCurBatch);
				failedBatches++;
				continue;	// To the next batch.
			}

			// Get the extracted files.
			String targetDirectory = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator;
			try {
				// Create this batch-directory.
				Path curBatchPath = Files.createDirectories(Paths.get(targetDirectory));

				// Unzip the file, then iterate over the PDFs, upload each one of them and get the S3-Url.
				String zipFileFullPath = targetDirectory + "fullTexts_" + assignmentsBatchCounter + "_" + batchCounter + ".zip";
				File zipFile = new File(zipFileFullPath);

				if ( ! saveZipFile(conn, zipFile) ) {
					updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMultimap, fileNamesForCurBatch);
					failedBatches++;
					continue;	// To the next batch.
				}
				//logger.debug("The zip file has been saved: " + zipFileFullPath);	// DEBUG!

				FileUnZipper.unzipFolder(Paths.get(zipFileFullPath), curBatchPath);

				String[] fileNames = new File(targetDirectory).list();
				if ( (fileNames == null) || (fileNames.length <= 1) ) {	// The directory might have only one file: the "zip-file".
logger.error("No full-text fileNames where extracted from directory: " + targetDirectory); updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMultimap, fileNamesForCurBatch); failedBatches ++; continue; // To the next batch. } // Iterate over the files and upload them to S3. int numUploadedFiles = 0; for ( String fileName : fileNames ) { String fileFullPath = targetDirectory + fileName; if ( fileFullPath.equals(zipFileFullPath) ) // Exclude the zip-file from uploading. continue; // Check if this stored file is related to one or more IDs from the Set. Set fileRelatedIDs = allFileNamesWithIDsHashMap.get(fileName); if ( fileRelatedIDs.isEmpty() ) { // In case the "fileName" is not inside the "allFileNamesWithIDsHashMap" HashMultimap. logger.error("The stored file \"" + fileName + "\" is not related to any ID which had a file requested from the Worker!"); continue; } if ( isFileNameProblematic(fileName, payloadsHashMultimap) ) // Do some more checks. continue; // At this point, we know that this file is related with one or more IDs of the payloads AND it has a valid fileName. // Let's try to upload the file to S3 and update the payloads of all related IDs, either in successful upload or not. String s3Url = S3ObjectStoreMinIO.uploadToS3(fileName, fileFullPath); if ( s3Url != null ) { setFullTextForMultipleIDs(payloadsHashMultimap, fileRelatedIDs, s3Url); // It checks weather (s3Url != null) and acts accordingly. numUploadedFiles++; } // Else, the record will have its file-data set to "null", in the end of this method. } logger.info("Finished uploading " + numUploadedFiles + " full-texts (out of " + (fileNames.length -1) + " distinct files) from assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore."); // (fileNames.length -1) --> minus the zip-file } catch (Exception e) { logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6). updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMultimap, fileNamesForCurBatch); failedBatches ++; } } // End of batches. // Delete this assignments-num directory. deleteDirectory(curAssignmentsBaseDir); // Check if none of the batches were handled.. if ( failedBatches == numOfBatches ) { logger.error("None of the " + numOfBatches + " batches could be handled for assignments_" + assignmentsBatchCounter + ", for worker: " + workerId); return UploadFullTextsResponse.unsuccessful; } else { replaceNotUploadedFileLocations(urlReports); // Make sure all records without an s3Url have < null > file-data. 
			return UploadFullTextsResponse.successful;
		}
	}


	private static HttpURLConnection getConnection(String baseUrl, long assignmentsBatchCounter, int batchNum, List<String> fileNamesForCurBatch, int totalBatches, String workerId) {
		baseUrl += batchNum + "/";
		String requestUrl = getRequestUrlForBatch(baseUrl, fileNamesForCurBatch);
		logger.info("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]");

		HttpURLConnection conn = null;
		try {
			conn = (HttpURLConnection) new URL(requestUrl).openConnection();
			conn.setRequestMethod("GET");
			conn.setRequestProperty("User-Agent", "UrlsController");
			conn.connect();
			int statusCode = conn.getResponseCode();
			if ( statusCode != 200 ) {
				logger.warn("HTTP-" + statusCode + ": " + getErrorMessageFromResponseBody(conn) + "\nProblem when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl);
				return null;
			}
		} catch (Exception e) {
			logger.warn("Problem when requesting the ZipFile of batch_" + batchNum + " of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl + "\n" + e.getMessage());
			return null;
		}
		return conn;
	}


	private static String getErrorMessageFromResponseBody(HttpURLConnection conn) {
		StringBuilder errorMsgStrB = new StringBuilder(500);
		try ( BufferedReader br = new BufferedReader(new InputStreamReader(conn.getErrorStream())) ) {	// Try-with-resources
			String inputLine;
			while ( (inputLine = br.readLine()) != null ) {
				if ( !inputLine.isEmpty() )
					errorMsgStrB.append(inputLine);
			}
			return (errorMsgStrB.length() != 0) ? errorMsgStrB.toString() : null;	// Make sure we return a "null" on empty string, to better handle the case in the caller-function.
		} catch ( IOException ioe ) {
			logger.error("IOException when retrieving the error-message: " + ioe.getMessage());
			return null;
		} catch ( Exception e ) {
			logger.error("Could not extract the errorMessage!", e);
			return null;
		}
	}


	private static List<String> getFileNamesForBatch(List<String> allFileNames, int numAllFullTexts, int curBatch) {
		int initialIndex = ((curBatch - 1) * numOfFullTextsPerBatch);
		int endingIndex = (curBatch * numOfFullTextsPerBatch);
		if ( endingIndex > numAllFullTexts )	// This might be the case, when the "numAllFullTexts" is too small.
			endingIndex = numAllFullTexts;

		List<String> fileNamesOfCurBatch = new ArrayList<>(numOfFullTextsPerBatch);
		for ( int i = initialIndex; i < endingIndex; ++i ) {
			try {
				fileNamesOfCurBatch.add(allFileNames.get(i));
			} catch (IndexOutOfBoundsException ioobe) {
				logger.error("IOOBE for i=" + i + "\n" + ioobe.getMessage(), ioobe);
			}
		}
		return fileNamesOfCurBatch;
	}


	private static final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 50);	// TODO - Make it THREAD-LOCAL, if we move to multi-threaded batch requests.

	private static String getRequestUrlForBatch(String baseUrl, List<String> fileNamesForCurBatch) {
		sb.append(baseUrl);
		int numFullTextsCurBatch = fileNamesForCurBatch.size();
		for ( int j = 0; j < numFullTextsCurBatch; ++j ) {
			sb.append(fileNamesForCurBatch.get(j));
			if ( j < (numFullTextsCurBatch - 1) )
				sb.append(",");
		}
		String requestUrl = sb.toString();
		sb.setLength(0);	// Reset for the next batch.
		return requestUrl;
	}


	private static final int bufferSize = 20971520;	// 20 MB

	public static boolean saveZipFile(HttpURLConnection conn, File zipFile) {
		InputStream inStream = null;
		FileOutputStream outStream = null;
		try {
			inStream = conn.getInputStream();
			outStream = new FileOutputStream(zipFile);

			byte[] byteBuffer = new byte[bufferSize];	// 20 MB
			int bytesRead = -1;
			while ( (bytesRead = inStream.read(byteBuffer, 0, bufferSize)) != -1 ) {
				outStream.write(byteBuffer, 0, bytesRead);
			}
			return true;
		} catch (Exception e) {
			logger.error("Could not save the zip file \"" + zipFile.getName() + "\": " + e.getMessage(), e);
			return false;
		} finally {
			try {
				if ( inStream != null )
					inStream.close();
				if ( outStream != null )
					outStream.close();
			} catch (Exception e) {
				logger.error(e.getMessage(), e);
			}
		}
	}


	private static boolean isFileNameProblematic(String fileName, HashMultimap<String, Payload> payloadsHashMultimap) {
		// Get the ID of the file.
		Matcher matcher = FILENAME_ID.matcher(fileName);
		if ( !matcher.matches() ) {
			logger.error("The given fileName \"" + fileName + "\" was invalid! It could not be matched with matcher: " + matcher);
			return true;
		}
		String fileID = matcher.group(1);
		if ( (fileID == null) || fileID.isEmpty() ) {
			logger.error("The given fileName \"" + fileName + "\" was invalid. No fileID was extracted!");
			return true;
		}

		// Take the payloads which are related to this ID. An ID might have multiple original-urls, thus multiple payloads.
		// The ID we have here is the one from the first record which reached this file.
		// There might be other records pointing to this file. But, in order to mark this file as "valid", we have to match it with at least one of the records' IDs.
		// We do this to avoid handling and uploading irrelevant files which could find their way into the working directory (either because of a Worker's error, any other type of malfunction, or even a malicious action).
		Set<Payload> payloads = payloadsHashMultimap.get(fileID);
		if ( payloads.isEmpty() ) {
			logger.error("The given fileID \"" + fileID + "\" was not part of the \"payloadsHashMultimap\"!");
			return true;
		}

		// Search through the payloads to find at least one match, in order for this file to NOT be "problematic".
		for ( Payload payload : payloads ) {
			String location = payload.getLocation();
			if ( (location != null) && location.endsWith(fileName) )
				return false;	// It's not problematic.
		}

		logger.error("None of the locations of the payloads matched with the ID \"" + fileID + "\" end with the filename \"" + fileName + "\", as they were supposed to.");
		return true;
	}


	/**
	 * This method updates the UrlReports to not point to any downloaded fullText files.
	 * This is useful when the uploading process of the fullTexts to the S3-ObjectStore fails.
	 * Then, we don't want any "links" to locally stored files, which will be deleted.
	 * @param urlReports
	 */
	public static void updateUrlReportsToHaveNoFullTextFiles(List<UrlReport> urlReports) {
		for ( UrlReport urlReport : urlReports ) {
			Payload payload = urlReport.getPayload();
			if ( payload != null )
				setUnretrievedFullText(payload);
		}
	}


	private static void replaceNotUploadedFileLocations(List<UrlReport> urlReports) {
		for ( UrlReport urlReport : urlReports ) {
			Payload payload = urlReport.getPayload();
			if ( payload != null ) {
				String fileLocation = payload.getLocation();
				if ( (fileLocation != null) &&
						! fileLocation.startsWith(S3ObjectStoreMinIO.endpoint) )
					setUnretrievedFullText(payload);
			}
		}
	}


	public static void updateUrlReportsForCurBatchTOHaveNoFullTextFiles(HashMultimap<String, Payload> payloadsHashMultimap, List<String> fileNames) {
		for ( String fileName : fileNames ) {
			// Get the ID of the file.
			Matcher matcher = FILENAME_ID.matcher(fileName);
			if ( !matcher.matches() )
				continue;

			String id = matcher.group(1);
			if ( (id == null) || id.isEmpty() )
				continue;

			Set<Payload> payloads = payloadsHashMultimap.get(id);	// Set of all payloads connected to this ID.
			for ( Payload payload : payloads )
				if ( payload != null )
					setUnretrievedFullText(payload);	// It changes the payload in the original UrlReport list.
		}
	}


	public static void setUnretrievedFullText(Payload payload) {
		// Mark the full-text as not-retrieved, since it will be deleted from local-storage. The retrieved link to the full-text will be kept.
		payload.setLocation(null);
		payload.setHash(null);
		payload.setMime_type(null);
		payload.setSize(null);
	}


	/**
	 * Set the fileLocation for all those IDs related to the File. The IDs may have one or more payloads.
	 * @param payloadsHashMultimap
	 * @param fileIDs
	 * @param s3Url
	 */
	public static void setFullTextForMultipleIDs(HashMultimap<String, Payload> payloadsHashMultimap, Set<String> fileIDs, String s3Url) {
		for ( String id : fileIDs ) {
			Set<Payload> payloads = payloadsHashMultimap.get(id);
			if ( payloads.isEmpty() ) {
				logger.error("The given id \"" + id + "\" (coming from the \"allFileNamesWithIDsHashMap\"), is not found inside the \"payloadsHashMultimap\"!");
				continue;
			}

			for ( Payload payload : payloads )
				if ( payload.getHash() != null )	// Update only the records which led to a file, not all the records of this ID (an ID might have multiple original_urls pointing to different directions).
					payload.setLocation(s3Url);	// Update the file-location to the new S3-url. All the other file-data is already set from the Worker.
		}
	}


	public static boolean deleteDirectory(File curBatchDir) {
		try {
			org.apache.commons.io.FileUtils.deleteDirectory(curBatchDir);
			return true;
		} catch (IOException e) {
			logger.error("The following directory could not be deleted: " + curBatchDir.getName(), e);
			return false;
		}
	}


	private static String getCutBatchExceptionMessage(String sqleMessage) {
		// The sqleMessage contains the actual message followed by the long batch. This makes the logs unreadable, so we should shorten the message before logging it.
		int maxEnding = 1500;
		if ( sqleMessage.length() > maxEnding )
			return (sqleMessage.substring(0, maxEnding) + "...");
		else
			return sqleMessage;
	}


	// This is currently not used, but it may be useful in a future scenario.
	private static long getInputFileLinesNum() {
		long numOfLines = 0;
		try {
			numOfLines = Files.lines(Paths.get(inputFileFullPath)).count();
			logger.debug("The numOfLines in the inputFile is " + numOfLines);
		} catch (IOException e) {
			logger.error("Could not retrieve the numOfLines. " + e);
			return -1;
		}
		return numOfLines;
	}


	/**
	 * This method decodes a Json String and returns its members.
	 * @param jsonLine String
	 * @return Task
	 */
	public static Task jsonDecoder(String jsonLine) {
		// Get the ID and the url from the given jsonLine.
		String idStr = null;
		String urlStr = null;
		try {
			JSONObject jObj = new JSONObject(jsonLine);	// Construct a JSONObject from the retrieved jsonLine.
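			// Each jsonLine is expected to be a single JSON object holding (at least) the "id" and "url" members, e.g.: {"id":"<recordId>", "url":"<recordUrl>"}.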
			idStr = jObj.get("id").toString();
			urlStr = jObj.get("url").toString();
		} catch (JSONException je) {
			logger.warn("JSONException caught when trying to parse and extract values from jsonLine: \t" + jsonLine, je);
			return null;
		}

		if ( urlStr.isEmpty() ) {
			if ( !idStr.isEmpty() )	// If we only have the id, then go and log it.
				logger.warn("The url was not found for id: \"" + idStr + "\"");
			return null;
		}
		return new Task(idStr, urlStr, null);
	}


	/**
	 * This method parses a Json file and extracts the urls, along with the IDs.
	 * @return HashMultimap
	 */
	public static HashMultimap<String, String> getNextIdUrlPairBatchFromJson() {
		Task inputIdUrlTuple;
		int expectedPathsPerID = 5;
		int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID;

		HashMultimap<String, String> idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID);

		int curBeginning = fileIndex.get();

		while ( inputScanner.get().hasNextLine() && (fileIndex.get() < (curBeginning + jsonBatchSize)) ) {	// While (!EOF) and inside the current url-batch, iterate through lines.
			//logger.debug("fileIndex: " + FileUtils.fileIndex.get());	// DEBUG!

			// Take the next line from the input.
			String retrievedLineStr = inputScanner.get().nextLine();
			//logger.debug("Loaded from inputFile: " + retrievedLineStr);	// DEBUG!

			fileIndex.set(fileIndex.get() + 1);

			if ( retrievedLineStr.isEmpty() ) {
				unretrievableInputLines.set(unretrievableInputLines.get() + 1);
				continue;
			}

			if ( (inputIdUrlTuple = jsonDecoder(retrievedLineStr)) == null ) {	// Decode the jsonLine and take the two attributes.
				logger.warn("A problematic inputLine was found: \t" + retrievedLineStr);
				unretrievableInputLines.set(unretrievableInputLines.get() + 1);
				continue;
			}

			if ( !idAndUrlMappedInput.put(inputIdUrlTuple.getId(), inputIdUrlTuple.getUrl()) ) {
				// We have a duplicate id-url pair in the input.. count it here, as we cannot pass it through the HashMultimap. It's possible that this, as well as the original, might lead to a docUrl.
				duplicateIdUrlEntries.set(duplicateIdUrlEntries.get() + 1);
			}
		}

		return idAndUrlMappedInput;
	}


	/**
	 * This method returns the number of (non-heading, non-empty) lines we have read from the inputFile.
	 * @return loadedUrls
	 */
	public static int getCurrentlyLoadedUrls() {	// In the end, it gives the total number of urls we have processed.
		return FileUtils.fileIndex.get() - FileUtils.unretrievableInputLines.get();
	}


	/**
	 * This method checks if there is no more input-data and returns true in that case.
	 * Otherwise, it returns false, if there is more input-data to be loaded.
	 * An error is logged if no input-urls were retrieved at all.
	 * @param isEmptyOfData
	 * @param isFirstRun
	 * @return finished loading / not finished
	 */
	public static boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun) {
		if ( isEmptyOfData ) {
			if ( isFirstRun )
				logger.error("Could not retrieve any urls from the inputFile!");
			else
				logger.debug("Done loading " + FileUtils.getCurrentlyLoadedUrls() + " urls from the inputFile.");
			return true;
		}
		return false;
	}

}