- Prepare version for next release.

- Fix a bug where the "fileNameID" was used instead of the "OpenAireID" in the S3 location of bulkImported files. In the aggregation workflow the "fileNameID" happens to equal the OpenAireID, but in bulk-import it does not (see the sketch below).
- Update dependencies.
- Code polishing.
Branch: master
Author: Lampros Smyrnaios (1 month ago)
Parent: 8bc5cc35e2
Commit: 1d821ed803
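For context on the first bullet, here is a minimal, hypothetical sketch of the S3 object naming; the real format is produced by FileUtils.constructS3FileName(), whose body this commit does not show, so the layout below is an assumption for illustration only.

public class S3NameSketch {
    // Hypothetical object-name layout; the real one lives in FileUtils.constructS3FileName().
    static String s3ObjectName(String openAireId, String datasourceId, String hash, String dotFileExtension) {
        return datasourceId + "/" + openAireId + "::" + hash + dotFileExtension;
    }

    public static void main(String[] args) {
        // Before this fix, the "fileNameID" (parsed from the local file name) was passed as the
        // first argument. In aggregation that id equals the OpenAireID, so the bug was invisible;
        // bulk-imported files follow other naming conventions, so their S3 locations were wrong.
        System.out.println(s3ObjectName("doi_________::0123abcd", "openaire____::ds1", "a1b2c3d4", ".pdf"));
    }
}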

build.gradle

@@ -6,7 +6,7 @@ plugins {
 java {
     group = 'eu.openaire.urls_controller'
-    version = '2.6.3-SNAPSHOT'
+    version = '2.7.0-SNAPSHOT'
     sourceCompatibility = JavaVersion.VERSION_1_8
 }
@@ -18,6 +18,10 @@ repositories {
     }
 }
+
+ext {
+    hadoopVersion = '3.4.0'
+}
 dependencies {
     runtimeOnly "org.springframework.boot:spring-boot-devtools"
@@ -76,7 +80,7 @@ dependencies {
     implementation('org.apache.parquet:parquet-avro:1.13.1')
     // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common
-    implementation('org.apache.hadoop:hadoop-common:3.3.6') {
+    implementation("org.apache.hadoop:hadoop-common:$hadoopVersion") {
         exclude group: 'org.apache.parquet', module: 'parquet-avro'
         exclude group: 'org.apache.avro', module: 'avro'
         exclude group: 'org.slf4j', module: 'slf4j-api'
@@ -92,7 +96,7 @@ dependencies {
     }
     // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core
-    implementation('org.apache.hadoop:hadoop-mapreduce-client-core:3.3.6') {
+    implementation("org.apache.hadoop:hadoop-mapreduce-client-core:$hadoopVersion") {
         exclude group: 'org.apache.parquet', module: 'parquet-avro'
         exclude group: 'org.apache.avro', module: 'avro'
         exclude group: 'org.slf4j', module: 'slf4j-api'
@@ -106,17 +110,16 @@ dependencies {
     // Add back some updated version of the needed dependencies.
     implementation 'org.apache.thrift:libthrift:0.17.0' // Newer versions (>=0.18.X) are not compatible with JAVA 8.
-    implementation 'com.fasterxml.woodstox:woodstox-core:6.6.1'
+    implementation 'com.fasterxml.woodstox:woodstox-core:6.6.2'
     // https://mvnrepository.com/artifact/org.json/json
-    implementation 'org.json:json:20240303'
+    implementation 'org.json:json:20240303' // This is used only in "ParquetFileUtils.createRemoteParquetDirectories()". TODO - Replace it with "gson".
     // https://mvnrepository.com/artifact/com.google.code.gson/gson
     implementation 'com.google.code.gson:gson:2.10.1'
     // https://mvnrepository.com/artifact/io.micrometer/micrometer-registry-prometheus
-    runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.3'
+    runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.4'
     testImplementation 'org.springframework.security:spring-security-test'
     testImplementation "org.springframework.boot:spring-boot-starter-test"

gradle/wrapper/gradle-wrapper.properties

@@ -1,6 +1,6 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip
 networkTimeout=10000
 validateDistributionUrl=true
 zipStoreBase=GRADLE_USER_HOME

BulkImportServiceImpl.java

@@ -270,7 +270,7 @@ public class BulkImportServiceImpl implements BulkImportService {
                 failedFiles.add(fileLocation);
             }
-            if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and right it to the file.
+            if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and write it to the file.
                 msg = "Progress for segment-" + segmentCounter + " : " + payloadRecords.size() + " files have been imported and " + failedFiles.size() + " have failed, out of " + numOfFilesInSegment + " files.";
                 if ( logger.isTraceEnabled() )
                     logger.trace(msg + additionalLoggingMsg);
@@ -363,8 +363,7 @@ public class BulkImportServiceImpl implements BulkImportService {
     private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
             throws ConnectException, UnknownHostException
     {
-        File fullTextFile = new File(fileLocation);
-        DocFileData docFileData = new DocFileData(fullTextFile, null, null, null);
+        DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null);
         docFileData.calculateAndSetHashAndSize();

         // Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
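DocFileData.calculateAndSetHashAndSize() is not part of this diff; the sketch below shows roughly what such a step involves, assuming an MD5 content hash (the algorithm actually used by DocFileData is an assumption here).

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class HashAndSizeSketch {
    // Hypothetical stand-in for DocFileData.calculateAndSetHashAndSize().
    static String md5Hex(File file) throws IOException, NoSuchAlgorithmException {
        byte[] bytes = Files.readAllBytes(file.toPath()); // full-texts are small enough to buffer whole
        StringBuilder hex = new StringBuilder();
        for ( byte b : MessageDigest.getInstance("MD5").digest(bytes) )
            hex.append(String.format("%02x", b));
        return hex.toString();
    }

    public static void main(String[] args) throws IOException, NoSuchAlgorithmException {
        File file = new File(args[0]);
        System.out.println("hash = " + md5Hex(file) + ", size = " + file.length());
    }
}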
@@ -421,7 +420,7 @@ public class BulkImportServiceImpl implements BulkImportService {
             s3Url = alreadyFoundFileLocation;
         } else
-            s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), fileNameID, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
+            s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);

         return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
                 docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.

FileUtils.java

@@ -466,7 +466,7 @@ public class FileUtils {
         }
         // The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
         // Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
-        String fileNameID = matcher.group(4);
+        String fileNameID = matcher.group(4); // The "fileNameID" is the OpenAIRE_ID for this file.
         if ( (fileNameID == null) || fileNameID.isEmpty() ) {
             logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
             continue;
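The two comments above hinge on the regex's capture-group layout. The pattern below is hypothetical (the real FILENAME pattern in FileUtils may differ), but it shows why group(4) is the safe choice: group(3) keeps a possible "(N)" de-duplication suffix, while group(4) is the bare id.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FileNameIdSketch {
    // Hypothetical: optional directory, then "<id>", an optional "(N)" suffix, then an extension.
    static final Pattern FILENAME = Pattern.compile("(?:([^/]+)/)?(((.+?)(?:\\(\\d+\\))?)(\\.[\\w]{2,10}))$");

    public static void main(String[] args) {
        Matcher matcher = FILENAME.matcher("doi_________::0123abcd(2).pdf");
        if ( matcher.matches() ) {
            System.out.println(matcher.group(3)); // "doi_________::0123abcd(2)" - filenameWithoutExtension
            System.out.println(matcher.group(4)); // "doi_________::0123abcd"    - fileNameID
        }
    }
}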
@@ -521,10 +521,10 @@ public class FileUtils {
     }

-    public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String fileNameID,
+    public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String openAireId,
                                                    String dotFileExtension, String datasourceId, String hash) throws ConnectException, UnknownHostException
     {
-        String filenameForS3 = constructS3FileName(fileName, fileNameID, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store.
+        String filenameForS3 = constructS3FileName(fileName, openAireId, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store.
         if ( filenameForS3 == null ) // The error is logged inside.
             return null;