- Prepare version for next release.

- Fix a bug where the "fileNameID" was used, instead of the "OpenAireID", in the S3 location of bulk-imported files. In aggregation the "fileNameID" matches the OpenAireID, but in bulk-import it does not (see the sketch below this list).
- Update dependencies.
- Code polishing.
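
A minimal, self-contained sketch of the naming issue described in the first point above. The S3-key format, the helper name "buildS3Key" and all example values are assumptions for illustration only; the project's real logic lives in "FileUtils.constructS3FileName()" and "FileUtils.constructS3FilenameAndUploadToS3()", which are changed further down in this diff.

import java.io.File;

public class S3KeyNamingSketch {

    // Simplified, hypothetical stand-in for FileUtils.constructS3FileName(); the real key format may differ.
    static String buildS3Key(String idForS3, String dotFileExtension, String datasourceId, String hash) {
        return datasourceId + "/" + idForS3 + "::" + hash + dotFileExtension;
    }

    public static void main(String[] args) {
        String datasourceId = "od______2659";                                   // hypothetical values
        String hash = "3f2ab8ff13720e8ad9047dd39466b3c8";
        String openAireId = "od______2659::0000aaaa1111bbbb2222cccc3333dddd";   // the record's OpenAIRE id

        // Aggregation-crawled files are named after the OpenAireID, so the file-name-derived
        // "fileNameID" coincides with the OpenAireID and yields the intended S3 key.
        String crawledFileName = openAireId + ".pdf";
        String fileNameID = new File(crawledFileName).getName().replaceFirst("\\.[^.]+$", "");
        System.out.println(buildS3Key(fileNameID, ".pdf", datasourceId, hash));

        // Bulk-imported files keep their original, arbitrary names; there the "fileNameID" is NOT
        // the OpenAireID, so only the OpenAireID produces the correct S3 location.
        String bulkFileNameID = "my_uploaded_fulltext";
        System.out.println(buildS3Key(openAireId, ".pdf", datasourceId, hash));     // correct (this fix)
        System.out.println(buildS3Key(bulkFileNameID, ".pdf", datasourceId, hash)); // what the old code produced
    }
}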
Lampros Smyrnaios 2024-03-28 06:09:28 +02:00
parent 8bc5cc35e2
commit 1d821ed803
4 changed files with 17 additions and 15 deletions

build.gradle

@@ -6,7 +6,7 @@ plugins {
 java {
     group = 'eu.openaire.urls_controller'
-    version = '2.6.3-SNAPSHOT'
+    version = '2.7.0-SNAPSHOT'
     sourceCompatibility = JavaVersion.VERSION_1_8
 }
@@ -18,6 +18,10 @@ repositories {
     }
 }
 
+ext {
+    hadoopVersion = '3.4.0'
+}
+
 dependencies {
     runtimeOnly "org.springframework.boot:spring-boot-devtools"
@@ -76,7 +80,7 @@ dependencies {
     implementation('org.apache.parquet:parquet-avro:1.13.1')
 
     // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common
-    implementation('org.apache.hadoop:hadoop-common:3.3.6') {
+    implementation("org.apache.hadoop:hadoop-common:$hadoopVersion") {
         exclude group: 'org.apache.parquet', module: 'parquet-avro'
         exclude group: 'org.apache.avro', module: 'avro'
         exclude group: 'org.slf4j', module: 'slf4j-api'
@@ -92,7 +96,7 @@ dependencies {
     }
 
     // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core
-    implementation('org.apache.hadoop:hadoop-mapreduce-client-core:3.3.6') {
+    implementation("org.apache.hadoop:hadoop-mapreduce-client-core:$hadoopVersion") {
         exclude group: 'org.apache.parquet', module: 'parquet-avro'
         exclude group: 'org.apache.avro', module: 'avro'
         exclude group: 'org.slf4j', module: 'slf4j-api'
@@ -106,17 +110,16 @@ dependencies {
     // Add back some updated version of the needed dependencies.
     implementation 'org.apache.thrift:libthrift:0.17.0' // Newer versions (>=0.18.X) are not compatible with JAVA 8.
-    implementation 'com.fasterxml.woodstox:woodstox-core:6.6.1'
+    implementation 'com.fasterxml.woodstox:woodstox-core:6.6.2'
 
     // https://mvnrepository.com/artifact/org.json/json
-    implementation 'org.json:json:20240303'
+    implementation 'org.json:json:20240303' // This is used only in "ParquetFileUtils.createRemoteParquetDirectories()". TODO - Replace it with "gson".
 
     // https://mvnrepository.com/artifact/com.google.code.gson/gson
     implementation 'com.google.code.gson:gson:2.10.1'
 
     // https://mvnrepository.com/artifact/io.micrometer/micrometer-registry-prometheus
-    runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.3'
+    runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.4'
 
     testImplementation 'org.springframework.security:spring-security-test'
     testImplementation "org.springframework.boot:spring-boot-starter-test"

gradle-wrapper.properties

@@ -1,6 +1,6 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip
 networkTimeout=10000
 validateDistributionUrl=true
 zipStoreBase=GRADLE_USER_HOME

BulkImportServiceImpl.java

@@ -270,7 +270,7 @@ public class BulkImportServiceImpl implements BulkImportService {
                 failedFiles.add(fileLocation);
             }
 
-            if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and right it to the file.
+            if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and write it to the file.
                 msg = "Progress for segment-" + segmentCounter + " : " + payloadRecords.size() + " files have been imported and " + failedFiles.size() + " have failed, out of " + numOfFilesInSegment + " files.";
                 if ( logger.isTraceEnabled() )
                     logger.trace(msg + additionalLoggingMsg);
@@ -363,8 +363,7 @@ public class BulkImportServiceImpl implements BulkImportService {
     private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
             throws ConnectException, UnknownHostException
     {
-        File fullTextFile = new File(fileLocation);
-        DocFileData docFileData = new DocFileData(fullTextFile, null, null, null);
+        DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null);
         docFileData.calculateAndSetHashAndSize();
 
         // Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
@@ -421,7 +420,7 @@ public class BulkImportServiceImpl implements BulkImportService {
             s3Url = alreadyFoundFileLocation;
         } else
-            s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), fileNameID, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
+            s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
 
         return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
                 docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.

FileUtils.java

@@ -466,7 +466,7 @@ public class FileUtils {
             }
             // The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
             // Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
-            String fileNameID = matcher.group(4);
+            String fileNameID = matcher.group(4); // The "fileNameID" is the OpenAIRE_ID for this file.
             if ( (fileNameID == null) || fileNameID.isEmpty() ) {
                 logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
                 continue;
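
The two comments in the hunk above explain why the "fileNameID" (group 4) is preferred over the "filenameWithoutExtension" (group 3). Below is a hypothetical sketch of that distinction; the regex, group numbering and file paths are illustrative assumptions, not the project's actual pattern.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FileNameIdSketch {

    // Illustrative pattern only; the project's real regex and group numbering may differ.
    // group(1) = fileDir, group(2) = fileName, group(3) = filenameWithoutExtension,
    // group(4) = fileNameID (without a possible "(N)" duplication suffix), group(5) = dotFileExtension.
    private static final Pattern FILEPATH_ID_EXTENSION =
            Pattern.compile("^(.*)/((([^/()]+)(?:\\([0-9]+\\))?)(\\.[^./]+))$");

    public static void main(String[] args) {
        String[] paths = {
                "/tmp/fulltexts/od______2659::0000aaaa1111bbbb2222cccc3333dddd.pdf",
                "/tmp/fulltexts/od______2659::0000aaaa1111bbbb2222cccc3333dddd(2).pdf"
        };
        for (String path : paths) {
            Matcher matcher = FILEPATH_ID_EXTENSION.matcher(path);
            if ( !matcher.matches() )
                continue;
            // The "filenameWithoutExtension" keeps the "(2)" duplication suffix of the second file,
            // while the "fileNameID" does not; that is why the latter is used for the S3 location.
            System.out.println("filenameWithoutExtension = " + matcher.group(3)
                    + " | fileNameID = " + matcher.group(4));
        }
    }
}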
@@ -521,10 +521,10 @@ public class FileUtils {
     }
 
-    public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String fileNameID,
+    public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String openAireId,
                                         String dotFileExtension, String datasourceId, String hash) throws ConnectException, UnknownHostException
     {
-        String filenameForS3 = constructS3FileName(fileName, fileNameID, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store.
+        String filenameForS3 = constructS3FileName(fileName, openAireId, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store.
 
         if ( filenameForS3 == null ) // The error is logged inside.
             return null;