- Prepare version for next release.

- Fix a bug where the "fileNameID" was used instead of the "OpenAireID" in the S3 location of bulkImported files. In the aggregation workflow the "fileNameID" happens to equal the OpenAireID, but in bulk-import it does not (see the sketch below).
- Update dependencies.
- Code polishing.
Branch: master
Author: Lampros Smyrnaios (1 month ago)
Parent: 8bc5cc35e2
Commit: 1d821ed803
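For context on the first bullet, here is a minimal, hypothetical sketch of the S3 object naming; the real format is produced by FileUtils.constructS3FileName(), whose body this commit does not show, so the layout below is an assumption for illustration only.

public class S3NameSketch {
    // Hypothetical object-name layout; the real one lives in FileUtils.constructS3FileName().
    static String s3ObjectName(String openAireId, String datasourceId, String hash, String dotFileExtension) {
        return datasourceId + "/" + openAireId + "::" + hash + dotFileExtension;
    }

    public static void main(String[] args) {
        // Before this fix, the "fileNameID" (parsed from the local file name) was passed as the
        // first argument. In aggregation that id equals the OpenAireID, so the bug was invisible;
        // bulk-imported files follow other naming conventions, so their S3 locations were wrong.
        System.out.println(s3ObjectName("doi_________::0123abcd", "openaire____::ds1", "a1b2c3d4", ".pdf"));
    }
}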

build.gradle

@@ -6,7 +6,7 @@ plugins {
 java {
     group = 'eu.openaire.urls_controller'
-    version = '2.6.3-SNAPSHOT'
+    version = '2.7.0-SNAPSHOT'
     sourceCompatibility = JavaVersion.VERSION_1_8
 }
@@ -18,6 +18,10 @@ repositories {
     }
 }
+
+ext {
+    hadoopVersion = '3.4.0'
+}
 dependencies {
     runtimeOnly "org.springframework.boot:spring-boot-devtools"
@@ -76,7 +80,7 @@ dependencies {
     implementation('org.apache.parquet:parquet-avro:1.13.1')
     // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common
-    implementation('org.apache.hadoop:hadoop-common:3.3.6') {
+    implementation("org.apache.hadoop:hadoop-common:$hadoopVersion") {
         exclude group: 'org.apache.parquet', module: 'parquet-avro'
         exclude group: 'org.apache.avro', module: 'avro'
         exclude group: 'org.slf4j', module: 'slf4j-api'
@@ -92,7 +96,7 @@ dependencies {
     }
     // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core
-    implementation('org.apache.hadoop:hadoop-mapreduce-client-core:3.3.6') {
+    implementation("org.apache.hadoop:hadoop-mapreduce-client-core:$hadoopVersion") {
         exclude group: 'org.apache.parquet', module: 'parquet-avro'
         exclude group: 'org.apache.avro', module: 'avro'
         exclude group: 'org.slf4j', module: 'slf4j-api'
@@ -106,17 +110,16 @@ dependencies {
     // Add back some updated version of the needed dependencies.
     implementation 'org.apache.thrift:libthrift:0.17.0' // Newer versions (>=0.18.X) are not compatible with JAVA 8.
-    implementation 'com.fasterxml.woodstox:woodstox-core:6.6.1'
+    implementation 'com.fasterxml.woodstox:woodstox-core:6.6.2'
     // https://mvnrepository.com/artifact/org.json/json
-    implementation 'org.json:json:20240303'
+    implementation 'org.json:json:20240303' // This is used only in "ParquetFileUtils.createRemoteParquetDirectories()". TODO - Replace it with "gson".
     // https://mvnrepository.com/artifact/com.google.code.gson/gson
     implementation 'com.google.code.gson:gson:2.10.1'
     // https://mvnrepository.com/artifact/io.micrometer/micrometer-registry-prometheus
-    runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.3'
+    runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.4'
     testImplementation 'org.springframework.security:spring-security-test'
     testImplementation "org.springframework.boot:spring-boot-starter-test"

gradle/wrapper/gradle-wrapper.properties

@@ -1,6 +1,6 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip
 networkTimeout=10000
 validateDistributionUrl=true
 zipStoreBase=GRADLE_USER_HOME

BulkImportServiceImpl.java

@@ -270,7 +270,7 @@ public class BulkImportServiceImpl implements BulkImportService {
                 failedFiles.add(fileLocation);
             }
-            if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and right it to the file.
+            if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and write it to the file.
                 msg = "Progress for segment-" + segmentCounter + " : " + payloadRecords.size() + " files have been imported and " + failedFiles.size() + " have failed, out of " + numOfFilesInSegment + " files.";
                 if ( logger.isTraceEnabled() )
                     logger.trace(msg + additionalLoggingMsg);
@@ -363,8 +363,7 @@ public class BulkImportServiceImpl implements BulkImportService {
     private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
             throws ConnectException, UnknownHostException
     {
-        File fullTextFile = new File(fileLocation);
-        DocFileData docFileData = new DocFileData(fullTextFile, null, null, null);
+        DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null);
         docFileData.calculateAndSetHashAndSize();

         // Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
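DocFileData.calculateAndSetHashAndSize() is not part of this diff; the sketch below shows roughly what such a step involves, assuming an MD5 content hash (the algorithm actually used by DocFileData is an assumption here).

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class HashAndSizeSketch {
    // Hypothetical stand-in for DocFileData.calculateAndSetHashAndSize().
    static String md5Hex(File file) throws IOException, NoSuchAlgorithmException {
        byte[] bytes = Files.readAllBytes(file.toPath()); // full-texts are small enough to buffer whole
        StringBuilder hex = new StringBuilder();
        for ( byte b : MessageDigest.getInstance("MD5").digest(bytes) )
            hex.append(String.format("%02x", b));
        return hex.toString();
    }

    public static void main(String[] args) throws IOException, NoSuchAlgorithmException {
        File file = new File(args[0]);
        System.out.println("hash = " + md5Hex(file) + ", size = " + file.length());
    }
}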
@@ -421,7 +420,7 @@ public class BulkImportServiceImpl implements BulkImportService {
             s3Url = alreadyFoundFileLocation;
         } else
-            s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), fileNameID, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
+            s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);

         return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
                 docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.

FileUtils.java

@@ -466,7 +466,7 @@ public class FileUtils {
         }
         // The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
         // Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
-        String fileNameID = matcher.group(4);
+        String fileNameID = matcher.group(4); // The "fileNameID" is the OpenAIRE_ID for this file.
         if ( (fileNameID == null) || fileNameID.isEmpty() ) {
             logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
             continue;
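The two comments above hinge on the regex's capture-group layout. The pattern below is hypothetical (the real FILENAME pattern in FileUtils may differ), but it shows why group(4) is the safe choice: group(3) keeps a possible "(N)" de-duplication suffix, while group(4) is the bare id.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FileNameIdSketch {
    // Hypothetical: optional directory, then "<id>", an optional "(N)" suffix, then an extension.
    static final Pattern FILENAME = Pattern.compile("(?:([^/]+)/)?(((.+?)(?:\\(\\d+\\))?)(\\.[\\w]{2,10}))$");

    public static void main(String[] args) {
        Matcher matcher = FILENAME.matcher("doi_________::0123abcd(2).pdf");
        if ( matcher.matches() ) {
            System.out.println(matcher.group(3)); // "doi_________::0123abcd(2)" - filenameWithoutExtension
            System.out.println(matcher.group(4)); // "doi_________::0123abcd"    - fileNameID
        }
    }
}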
@@ -521,10 +521,10 @@ public class FileUtils {
     }

-    public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String fileNameID,
+    public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String openAireId,
                                                    String dotFileExtension, String datasourceId, String hash) throws ConnectException, UnknownHostException
     {
-        String filenameForS3 = constructS3FileName(fileName, fileNameID, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store.
+        String filenameForS3 = constructS3FileName(fileName, openAireId, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store.
         if ( filenameForS3 == null ) // The error is logged inside.
             return null;