- Optimize writing to the Bulk-import-report file.

- Show the IP of the worker which posts a "workerShutdownReport".
- Code polishing.
Branch: master
Lampros Smyrnaios, 1 month ago
parent b9b29dd51c | commit 8bc5cc35e2

@ -1,4 +1,5 @@
# UrlsController [![Build Status](https://jenkins-dnet.d4science.org/buildStatus/icon?job=UrlsController)](https://jenkins-dnet.d4science.org/job/UrlsController/)
# UrlsController
## [![Jenkins build status](https://jenkins-dnet.d4science.org/buildStatus/icon?job=UrlsController)](https://jenkins-dnet.d4science.org/job/UrlsController/)
The Controller's Application receives requests coming from the [**Workers**](https://code-repo.d4science.org/lsmyrnaios/UrlsWorker) (deployed on the cloud), constructs an assignments-list with data received from a database, and returns the list to the workers.<br>
Then, it receives the "WorkerReports", requests the full-texts from the workers in batches, and uploads them to the S3-Object-Store. Finally, it writes the related reports, along with the updated file-locations, into the database.<br>
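For illustration only, a minimal, hypothetical sketch of how a Worker might post its shutdown report to the Controller's "workerShutdownReport" endpoint (shown in the ShutdownController hunk further down); the base URL, port, and worker-id are assumptions, not taken from the actual Worker code:

```java
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;

public class ShutdownReportClientSketch {
    public static void main(String[] args) {
        String controllerBaseUrl = "http://localhost:1880/api"; // assumed base URL and port
        String workerId = "worker_X";                           // assumed worker-id
        RestTemplate restTemplate = new RestTemplate();
        // The endpoint name and the "workerId" request-param appear in the ShutdownController diff below.
        ResponseEntity<String> response = restTemplate.postForEntity(
                controllerBaseUrl + "/workerShutdownReport?workerId=" + workerId, null, String.class);
        System.out.println("Controller responded with status: " + response.getStatusCode());
    }
}
```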

@ -136,7 +136,8 @@ public class ShutdownController {
@PostMapping("workerShutdownReport")
public ResponseEntity<?> workerShutdownReport(@RequestParam String workerId, HttpServletRequest request)
{
String initMsg = "Received a \"workerShutdownReport\" from worker: \"" + workerId + "\".";
String remoteAddr = GenericUtils.getRequestorAddress(request);
String initMsg = "Received a \"workerShutdownReport\" from worker: \"" + workerId + "\" [IP: " + remoteAddr + "].";
WorkerInfo workerInfo = UrlsController.workersInfoMap.get(workerId);
if ( workerInfo == null ) {
String errorMsg = "The worker with id \"" + workerId + "\" has not participated in the PDF-Aggregation-Service!";
@ -144,9 +145,8 @@ public class ShutdownController {
return ResponseEntity.badRequest().body(errorMsg);
}
String remoteAddr = GenericUtils.getRequestorAddress(request);
if ( ! remoteAddr.equals(workerInfo.getWorkerIP()) ) {
logger.error(initMsg + " The request came from another IP: " + remoteAddr + " | while this worker was registered with this IP: " + workerInfo.getWorkerIP());
logger.error(initMsg + " The request came from an IP different from the one this worker was registered with: " + workerInfo.getWorkerIP());
return ResponseEntity.status(HttpStatus.FORBIDDEN).build();
}
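The IP check above relies on GenericUtils.getRequestorAddress(request), whose implementation is not part of this diff. A minimal sketch, assuming it prefers the "X-Forwarded-For" header (useful behind a reverse proxy) and falls back to the raw remote address; this is an assumption, not the repository's actual code:

```java
import javax.servlet.http.HttpServletRequest;

public class GenericUtilsSketch {
    // Assumed behavior; the real GenericUtils.getRequestorAddress() may differ.
    public static String getRequestorAddress(HttpServletRequest request) {
        String forwardedFor = request.getHeader("X-Forwarded-For"); // set by proxies / load-balancers
        if ( (forwardedFor != null) && !forwardedFor.isEmpty() )
            return forwardedFor.split(",")[0].trim(); // the first entry is the original client
        return request.getRemoteAddr(); // direct connection
    }
}
```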

@ -43,7 +43,7 @@ public class BulkImportReport {
public void addEvent(String event) {
eventsMultimap.put(GenericUtils.getReadableCurrentTimeAndZone(), event);
eventsMultimap.put(GenericUtils.getReadableCurrentTimeAndZone(), event); // This is synchronized.
}
public String getJsonReport()
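The "This is synchronized" note refers to eventsMultimap. One way to make concurrent addEvent() calls safe is a synchronized Guava multimap wrapper; the following is only a sketch under that assumption, and the actual field declaration in BulkImportReport may differ:

```java
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;

public class EventsMultimapSketch {
    // Assumed declaration: readable timestamps as keys, event messages as values, wrapped for thread-safety.
    private final Multimap<String, String> eventsMultimap =
            Multimaps.synchronizedListMultimap(LinkedListMultimap.create());

    public void addEvent(String timestamp, String event) {
        eventsMultimap.put(timestamp, event); // individual put() calls are synchronized by the wrapper
    }
}
```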

@ -78,7 +78,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String errorMsg = "The payloadsSchema could not be parsed!";
logger.error(errorMsg + additionalLoggingMsg);
bulkImportReport.addEvent(errorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
return false;
}
@ -88,7 +88,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String errorMsg = "Could not retrieve the files for bulk-import!";
logger.error(errorMsg + additionalLoggingMsg);
bulkImportReport.addEvent(errorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
return false;
}
@ -98,7 +98,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String errorMsg = "No files were found inside the bulkImportDir: " + bulkImportDirName;
logger.warn(errorMsg);
bulkImportReport.addEvent(errorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
return false;
}
@ -113,7 +113,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String errorMsg = "Could not create the local parquet-directory: " + localParquetDir;
logger.error(errorMsg + additionalLoggingMsg, e);
bulkImportReport.addEvent(errorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
return false;
}
@ -124,7 +124,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String errorMsg = "Could not create the remote HDFS-directory: " + currentBulkImportHdfsDir;
logger.error(errorMsg + additionalLoggingMsg);
bulkImportReport.addEvent(errorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
return false;
}
@ -139,7 +139,7 @@ public class BulkImportServiceImpl implements BulkImportService {
msg = "Going to bulk-import the " + numOfFiles + " files in parallel, after dividing them in " + subListsSize + " segments.";
logger.debug(msg + additionalLoggingMsg);
bulkImportReport.addEvent(msg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
for ( int i = 0; i < subListsSize; ++i ) {
int finalI = i;
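The division of the files into segments, mentioned in the log-message above, is not fully visible in this hunk. A minimal sketch of partitioning a file list into consecutive sublists for parallel import, assuming an approach such as Guava's Lists.partition (the actual segmentation code may differ):

```java
import com.google.common.collect.Lists;
import java.util.Arrays;
import java.util.List;

public class SegmentationSketch {
    public static void main(String[] args) {
        List<String> fileLocations = Arrays.asList("f1.pdf", "f2.pdf", "f3.pdf", "f4.pdf", "f5.pdf");
        int numOfThreads = 2; // assumed parallelism
        int sizeOfEachSegment = (int) Math.ceil(fileLocations.size() / (double) numOfThreads);
        List<List<String>> subLists = Lists.partition(fileLocations, sizeOfEachSegment); // consecutive segments
        int subListsSize = subLists.size();
        System.out.println("Divided " + fileLocations.size() + " files into " + subListsSize + " segments.");
        for ( int i = 0; i < subListsSize; ++i )
            System.out.println("segment-" + (i + 1) + " -> " + subLists.get(i));
    }
}
```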
@ -180,7 +180,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String errorMsg = "An error occurred when trying to bulk-import data from bulkImportDir: " + bulkImportDirName;
logger.error(errorMsg, e);
bulkImportReport.addEvent(errorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
return false;
} finally {
@ -193,7 +193,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String errorMsg = "None of the files inside the bulkImportDir: " + bulkImportDirName + " were imported!";
logger.error(errorMsg);
bulkImportReport.addEvent(errorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
return false;
} else if ( numAllFailedFiles > 0 ) { // Some failed, but not all.
@ -204,7 +204,7 @@ public class BulkImportServiceImpl implements BulkImportService {
logger.info(msg);
}
bulkImportReport.addEvent(msg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
// Merge the parquet files inside the table "payload_bulk_import", to improve performance of future operations.
DatabaseConnector.databaseLock.lock();
@ -212,7 +212,7 @@ public class BulkImportServiceImpl implements BulkImportService {
DatabaseConnector.databaseLock.unlock();
if ( mergeErrorMsg != null ) { // the message is already logged
bulkImportReport.addEvent(mergeErrorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
return false;
}
@ -220,7 +220,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String successMsg = "Finished the bulk-import procedure for " + provenance + " and bulkImportDir: " + bulkImportDirName;
logger.info(successMsg);
bulkImportReport.addEvent(successMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), false);
// The report-file will be overwritten every now and then, instead of appended, since we want to add an updated JSON report-object each time.
// Also, we do not want to write the object only at the end (in its final form), since we want the user to have the ability to request the report at any time,
// after submitting the bulk-import request, to see its progress (since the number of files may be very large and the processing may take many hours).
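Since the report is rewritten as a whole JSON snapshot each time (rather than appended), the underlying write must truncate the existing file. A minimal sketch of such an overwrite-style writer, as a hypothetical stand-in for FileUtils.writeToFile (whose actual signature and boolean flag are not shown in this diff):

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

public class ReportWriterSketch {
    // Hypothetical helper: replaces the previous report with the latest full JSON snapshot.
    public static boolean overwriteReport(String reportFileLocation, String jsonReport) {
        try {
            Files.write(Paths.get(reportFileLocation), jsonReport.getBytes(StandardCharsets.UTF_8),
                    StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE);
            return true;
        } catch (IOException e) {
            return false; // the caller can log the failure and add it to the bulk-import report
        }
    }
}
```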
@ -240,6 +240,7 @@ public class BulkImportServiceImpl implements BulkImportService {
String msg = "Going to import " + numOfFilesInSegment + " files, for segment-" + segmentCounter;
logger.debug(msg + additionalLoggingMsg);
bulkImportReport.addEvent(msg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
List<GenericData.Record> payloadRecords = new ArrayList<>(numOfFilesInSegment);
@ -254,7 +255,7 @@ public class BulkImportServiceImpl implements BulkImportService {
try {
record = processBulkImportedFile(fileLocation, provenance, bulkImportSource, timeMillis, additionalLoggingMsg);
} catch (Exception e) {
String errorMsg = "Exception when uploading the files of segment_" + segmentCounter + " to the S3 Object Store. Will avoid uploading any file for this segment..";
String errorMsg = "Exception when uploading the files of segment_" + segmentCounter + " to the S3 Object Store. Will avoid uploading the rest of the files for this segment..";
logger.error(errorMsg + additionalLoggingMsg);
bulkImportReport.addEvent(errorMsg);
for ( int j=i; j < numOfFilesInSegment; ++j )
@ -269,14 +270,14 @@ public class BulkImportServiceImpl implements BulkImportService {
failedFiles.add(fileLocation);
}
if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment.
if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and write it to the report-file.
msg = "Progress for segment-" + segmentCounter + " : " + payloadRecords.size() + " files have been imported and " + failedFiles.size() + " have failed, out of " + numOfFilesInSegment + " files.";
if ( logger.isTraceEnabled() )
logger.trace(msg + additionalLoggingMsg);
bulkImportReport.addEvent(msg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
}
}
} // End of processing files for this segment.
int numOfPayloadRecords = payloadRecords.size();
if ( numOfPayloadRecords == 0 ) {
@ -291,7 +292,6 @@ public class BulkImportServiceImpl implements BulkImportService {
String errorMsg = failedFiles.size() + " out of " + numOfFilesInSegment + " files failed to be bulk-imported, for segment-" + segmentCounter + " !";
logger.warn(errorMsg + additionalLoggingMsg);
bulkImportReport.addEvent(errorMsg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
}
// Construct the parquet file, upload it to HDFS and load it in the "payload_bulk_import" table.
@ -350,11 +350,12 @@ public class BulkImportServiceImpl implements BulkImportService {
// Delete all files except the ones in the "failedHashSet".
for ( String fileLocation : fileLocationsSegment ) {
if ( !failedFiles.contains(fileLocation) )
if ( !fileUtils.deleteFile(fileLocation) )
if ( !fileUtils.deleteFile(fileLocation) ) // The "error-log-message" is shown inside.
bulkImportReport.addEvent("The file " + fileLocation + " could not be deleted! Please make sure you have provided the WRITE-permission.");
}
}
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
return (numOfFilesInSegment - numOfPayloadRecords); // Return the numOfFailedFiles.
}
@ -385,6 +386,10 @@ public class BulkImportServiceImpl implements BulkImportService {
String datasourcePrefix = bulkImportSource.getDatasourcePrefix();
String fileNameID = fileLocationData.getFileNameID();
String openAireId = generateOpenaireId(fileNameID, datasourcePrefix, bulkImportSource.getIsAuthoritative());
if ( openAireId == null )
return null;
String actualUrl = (bulkImportSource.getPdfUrlPrefix() + fileNameID); // This string-concatenation works with the urls of arXiv. A different construction may be needed for other datasources.
String originalUrl = actualUrl; // We have the full-text files from bulk-import, so let's assume the original-url is also the full-text-link.
@ -403,10 +408,6 @@ public class BulkImportServiceImpl implements BulkImportService {
DatabaseConnector.databaseLock.unlock();
}
String openAireId = generateOpenaireId(fileNameID, datasourcePrefix, bulkImportSource.getIsAuthoritative());
if ( openAireId == null )
return null;
String s3Url = null;
if ( alreadyFoundFileLocation != null ) // If the full-text of this record is already-found and uploaded.

@ -287,7 +287,7 @@ public class UrlsServiceImpl implements UrlsService {
} else if ( uploadFullTextsResponse == FileUtils.UploadFullTextsResponse.unsuccessful ) {
logger.error("Failed to get and/or upload the fullTexts for batch-assignments_" + curReportAssignmentsCounter);
// The docUrls were still found! Just update ALL the fileLocations, sizes, hashes and mimetypes, to show that the files are not available.
fileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports, false);
fileUtils.removeUnretrievedFullTextsFromUrlReports(urlReports, false);
// We write only the payloads which are connected with retrieved full-texts, uploaded to S3-Object-Store.
// We continue with writing the "attempts", as we want to avoid re-checking the failed-urls later.
// The urls which give full-text (no matter if we could not get it from the worker), are flagged as "couldRetry" anyway, so they will be picked-up to be checked again later.
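The renamed removeUnretrievedFullTextsFromUrlReports(..) (see the FileUtils hunk below) clears the file-data of payloads whose full-text never reached the S3-Object-Store. A rough, self-contained sketch of that idea, with assumed model fields and an assumed S3-URL check; the real UrlReport/Payload classes and the actual logic may differ:

```java
import java.util.List;

public class UrlReportCleanupSketch {

    // Minimal stand-in model classes, assumed for this sketch only.
    static class Payload {
        String location, hash, mimeType;
        Long size;
    }
    static class UrlReport {
        Payload payload;
        Payload getPayload() { return payload; }
    }

    public void removeUnretrievedFullTextsFromUrlReports(List<UrlReport> urlReports, boolean shouldCheckAndKeepS3UploadedFiles) {
        for ( UrlReport urlReport : urlReports ) {
            Payload payload = urlReport.getPayload();
            if ( payload == null )
                continue;
            String location = payload.location;
            // Assumed heuristic: a non-null "https" location means the file was already uploaded to the S3-Object-Store.
            if ( shouldCheckAndKeepS3UploadedFiles && (location != null) && location.startsWith("https://") )
                continue;
            // No retrieved full-text: null-out the file-data so the record reflects that.
            payload.location = null;
            payload.hash = null;
            payload.mimeType = null;
            payload.size = null;
        }
    }
}
```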

@ -243,7 +243,7 @@ public class FileUtils {
if ( failedBatches == numOfBatches )
logger.error("None of the " + numOfBatches + " batches could be handled for assignments_" + assignmentsBatchCounter + ", for worker: " + workerId);
updateUrlReportsToHaveNoFullTextFiles(urlReports, true); // Make sure all records without an S3-Url have < null > file-data (some batches or uploads might have failed).
removeUnretrievedFullTextsFromUrlReports(urlReports, true); // Make sure all records without an S3-Url have < null > file-data (some batches or uploads might have failed).
deleteDirectory(new File(curAssignmentsBaseLocation));
// Check and warn about the number of failed payloads.
@ -617,7 +617,7 @@ public class FileUtils {
public static final int twentyFiveKb = 25_600; // 25 Kb
public static final int halfMb = 524_288; // 0.5 Mb = 512 Kb
public static final int halfMb = 524_288; // 0.5 Mb = 512 Kb = 524_288 bytes
public static final int tenMb = (10 * 1_048_576);
public boolean saveArchive(HttpURLConnection conn, File zstdFile)
@ -644,7 +644,7 @@ public class FileUtils {
* @param urlReports
* @param shouldCheckAndKeepS3UploadedFiles
*/
public void updateUrlReportsToHaveNoFullTextFiles(List<UrlReport> urlReports, boolean shouldCheckAndKeepS3UploadedFiles)
public void removeUnretrievedFullTextsFromUrlReports(List<UrlReport> urlReports, boolean shouldCheckAndKeepS3UploadedFiles)
{
for ( UrlReport urlReport : urlReports ) {
Payload payload = urlReport.getPayload();
@ -703,9 +703,11 @@ public class FileUtils {
// Get the id and url of any
String getDataForPayloadPrefillQuery = "select distinct pu.id, pu.url\n" +
"from " + DatabaseConnector.databaseName + ".publication_urls pu\n" +
// Exclude the "already-processed" pairs.
"left anti join " + DatabaseConnector.databaseName + ".attempt a on a.id=pu.id and a.original_url=pu.url\n" +
"left anti join " + DatabaseConnector.databaseName + ".payload p on p.id=pu.id and p.original_url=pu.url\n" +
"left anti join " + DatabaseConnector.databaseName + ".assignment asgn on asgn.id=pu.id and asgn.original_url=pu.url\n" +
// Limit the urls to the ones matching to the payload-urls found for the current assignments.
"where pu.url in " + getQueryListString(urlsToRetrieveRelatedIDs, urlsToRetrieveRelatedIDsSize, stringBuilderCapacity);
//logger.trace("getDataForPayloadPrefillQuery:\n" + getDataForPayloadPrefillQuery);
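The "where pu.url in ..." clause is produced by getQueryListString(..), which is not shown in this hunk. A hedged sketch of how such an IN-list string might be assembled (assumed behavior, including the quoting of the url values):

```java
import java.util.Arrays;
import java.util.List;

public class QueryListStringSketch {
    // Assumed helper: builds "('url1', 'url2', ...)" for use in a SQL IN-clause.
    public static String getQueryListString(List<String> values, int numValues, int stringBuilderCapacity) {
        StringBuilder sb = new StringBuilder(stringBuilderCapacity);
        sb.append("(");
        for ( int i = 0; i < numValues; ++i ) {
            sb.append("'").append(values.get(i)).append("'");
            if ( i < (numValues - 1) )
                sb.append(", ");
        }
        return sb.append(")").toString();
    }

    public static void main(String[] args) {
        List<String> urls = Arrays.asList("https://arxiv.org/pdf/1", "https://arxiv.org/pdf/2");
        System.out.println(getQueryListString(urls, urls.size(), 256)); // ('https://arxiv.org/pdf/1', 'https://arxiv.org/pdf/2')
    }
}
```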
