Move similar code into the new "ParquetFileUtils.getPayloadParquetRecord()" method.

Lampros Smyrnaios 2023-10-17 12:50:51 +03:00
parent f05eee7569
commit 40729c6295
2 changed files with 32 additions and 28 deletions


@@ -418,18 +418,8 @@ public class BulkImportServiceImpl implements BulkImportService {
         } else
             s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), fileNameID, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
-        GenericData.Record record = new GenericData.Record(ParquetFileUtils.payloadsSchema);
-        record.put("id", openAireId);
-        record.put("original_url", originalUrl);
-        record.put("actual_url", actualUrl);
-        record.put("date", timeMillis);
-        record.put("mimetype", bulkImportSource.getMimeType());
-        Long size = docFileData.getSize();
-        record.put("size", ((size != null) ? String.valueOf(size) : null));
-        record.put("hash", fileHash); // This is already checked and will not be null here.
-        record.put("location", s3Url);
-        record.put("provenance", ("bulk:" + provenance)); // Add the "bulk:" prefix in order to be more clear that this record comes from bulkImport, when looking all records in the "payload" VIEW.
-        return record;
+        return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
+                docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.
     }
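For context, a minimal sketch of the caller-side contract this refactoring introduces. The surrounding variables (openAireId, mimeType, size, etc.) are illustrative placeholders, not code from this repository:

// Hypothetical call site: getPayloadParquetRecord() returns null when record creation fails,
// so every caller has to handle that case (the bulk-import path above simply propagates the null).
GenericData.Record record = parquetFileUtils.getPayloadParquetRecord(
        openAireId, originalUrl, actualUrl, System.currentTimeMillis(),
        mimeType, size, fileHash, s3Url, provenance,
        true); // "true" prepends "bulk:" to the provenance; regular crawl payloads pass "false".
if ( record == null )
    return null; // Skip or propagate the failure, depending on the call site.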


@@ -306,23 +306,13 @@ public class ParquetFileUtils {
             if ( fileLocation == null ) // We want only the records with uploaded full-texts in the "payload" table.
                 continue;
-            try {
-                record = new GenericData.Record(payloadsSchema);
-                record.put("id", payload.getId());
-                record.put("original_url", payload.getOriginal_url());
-                record.put("actual_url", payload.getActual_url());
-                Timestamp timestamp = payload.getTimestamp_acquired();
-                record.put("date", (timestamp != null) ? timestamp.getTime() : System.currentTimeMillis());
-                record.put("mimetype", payload.getMime_type());
-                Long size = payload.getSize();
-                record.put("size", ((size != null) ? String.valueOf(size) : null));
-                record.put("hash", payload.getHash());
-                record.put("location", fileLocation);
-                record.put("provenance", payload.getProvenance());
+            Timestamp timestamp = payload.getTimestamp_acquired();
+            record = getPayloadParquetRecord(payload.getId(), payload.getOriginal_url(), payload.getActual_url(),
+                    (timestamp != null) ? timestamp.getTime() : System.currentTimeMillis(),
+                    payload.getMime_type(), payload.getSize(), payload.getHash(), fileLocation, payload.getProvenance(), false);
+            if ( record != null )
                 recordList.add(record);
-            } catch (Exception e) {
-                logger.error("Failed to create a payload record!", e);
-            }
         }
         int recordsSize = recordList.size();
@@ -349,6 +339,30 @@ public class ParquetFileUtils {
     }
+
+    public GenericData.Record getPayloadParquetRecord(String id, String original_url, String actual_url, long timeMillis, String mimetype, Long size,
+                                                      String hash, String fileLocation, String provenance, boolean isForBulkImport)
+    {
+        GenericData.Record record;
+        try {
+            record = new GenericData.Record(payloadsSchema);
+            record.put("id", id);
+            record.put("original_url", original_url);
+            record.put("actual_url", actual_url);
+            record.put("date", timeMillis);
+            record.put("mimetype", mimetype);
+            record.put("size", ((size != null) ? String.valueOf(size) : null));
+            record.put("hash", hash);
+            record.put("location", fileLocation);
+            record.put("provenance", (isForBulkImport ? "bulk:" : "") + provenance);
+            // The "bulk:" prefix makes it clearer, when looking at all records in the "payload" VIEW, that a record comes from bulkImport.
+            return record;
+        } catch (Exception e) {
+            logger.error("Failed to create a payload record!", e);
+            return null;
+        }
+    }
+
     public boolean writeToParquet(List<GenericData.Record> recordList, Schema schema, String fullFilePath)
     {
         OutputFile outputFile;
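For reference, the record fields populated above imply a payload schema roughly like the one sketched below. This is only an inference from the record.put() calls; the real payloadsSchema is loaded elsewhere in ParquetFileUtils, and its exact field types and nullability may differ:

import org.apache.avro.Schema;

// Hypothetical Avro schema, reconstructed from the fields the helper populates.
Schema payloadsSchema = new Schema.Parser().parse(
    "{ \"type\": \"record\", \"name\": \"payload\", \"fields\": ["
    + "{\"name\": \"id\", \"type\": \"string\"},"
    + "{\"name\": \"original_url\", \"type\": \"string\"},"
    + "{\"name\": \"actual_url\", \"type\": \"string\"},"
    + "{\"name\": \"date\", \"type\": \"long\"}," // "timeMillis" is written here.
    + "{\"name\": \"mimetype\", \"type\": [\"null\", \"string\"]},"
    + "{\"name\": \"size\", \"type\": [\"null\", \"string\"]}," // The size is stored as a String.
    + "{\"name\": \"hash\", \"type\": \"string\"},"
    + "{\"name\": \"location\", \"type\": \"string\"},"
    + "{\"name\": \"provenance\", \"type\": \"string\"}"
    + "] }");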