From 1d821ed80347b1fc4225874b77d242ca183c4a96 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 28 Mar 2024 06:09:28 +0200 Subject: [PATCH] - Prepare version for next release. - Fix a bug where the "fileNameID" was used instead of the "OpenAireID" in the S3 location of bulk-imported files; in aggregation the "fileNameID" equals the OpenAireID, but in bulk-import it does not. - Update dependencies. - Code polishing. --- build.gradle | 17 ++++++++++------- gradle/wrapper/gradle-wrapper.properties | 2 +- .../services/BulkImportServiceImpl.java | 7 +++---- .../urls_controller/util/FileUtils.java | 6 +++--- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/build.gradle b/build.gradle index 39180a2..16c1995 100644 --- a/build.gradle +++ b/build.gradle @@ -6,7 +6,7 @@ plugins { java { group = 'eu.openaire.urls_controller' - version = '2.6.3-SNAPSHOT' + version = '2.7.0-SNAPSHOT' sourceCompatibility = JavaVersion.VERSION_1_8 } @@ -18,6 +18,10 @@ repositories { } } +ext { + hadoopVersion = '3.4.0' +} + dependencies { runtimeOnly "org.springframework.boot:spring-boot-devtools" @@ -76,7 +80,7 @@ dependencies { implementation('org.apache.parquet:parquet-avro:1.13.1') // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common - implementation('org.apache.hadoop:hadoop-common:3.3.6') { + implementation("org.apache.hadoop:hadoop-common:$hadoopVersion") { exclude group: 'org.apache.parquet', module: 'parquet-avro' exclude group: 'org.apache.avro', module: 'avro' exclude group: 'org.slf4j', module: 'slf4j-api' @@ -92,7 +96,7 @@ dependencies { } // https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core - implementation('org.apache.hadoop:hadoop-mapreduce-client-core:3.3.6') { + implementation("org.apache.hadoop:hadoop-mapreduce-client-core:$hadoopVersion") { exclude group: 'org.apache.parquet', module: 'parquet-avro' exclude group: 'org.apache.avro', module: 'avro' exclude group: 'org.slf4j', module: 'slf4j-api' @@ -106,17 +110,16 @@ dependencies
{ // Add back some updated version of the needed dependencies. implementation 'org.apache.thrift:libthrift:0.17.0' // Newer versions (>=0.18.X) are not compatible with JAVA 8. - implementation 'com.fasterxml.woodstox:woodstox-core:6.6.1' + implementation 'com.fasterxml.woodstox:woodstox-core:6.6.2' // https://mvnrepository.com/artifact/org.json/json - implementation 'org.json:json:20240303' + implementation 'org.json:json:20240303' // This is used only in "ParquetFileUtils.createRemoteParquetDirectories()". TODO - Replace it with "gson". // https://mvnrepository.com/artifact/com.google.code.gson/gson implementation 'com.google.code.gson:gson:2.10.1' - // https://mvnrepository.com/artifact/io.micrometer/micrometer-registry-prometheus - runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.3' + runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.4' testImplementation 'org.springframework.security:spring-security-test' testImplementation "org.springframework.boot:spring-boot-starter-test" diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index a80b22c..b82aa23 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip networkTimeout=10000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME diff --git a/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java index 45574d4..52f85f0 100644 --- a/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java +++ b/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java @@ -270,7 +270,7 @@ public class BulkImportServiceImpl implements 
BulkImportService { failedFiles.add(fileLocation); } - if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and right it to the file. + if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and write it to the file. msg = "Progress for segment-" + segmentCounter + " : " + payloadRecords.size() + " files have been imported and " + failedFiles.size() + " have failed, out of " + numOfFilesInSegment + " files."; if ( logger.isTraceEnabled() ) logger.trace(msg + additionalLoggingMsg); @@ -363,8 +363,7 @@ public class BulkImportServiceImpl implements BulkImportService { private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg) throws ConnectException, UnknownHostException { - File fullTextFile = new File(fileLocation); - DocFileData docFileData = new DocFileData(fullTextFile, null, null, null); + DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null); docFileData.calculateAndSetHashAndSize(); // Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded. 
@@ -421,7 +420,7 @@ public class BulkImportServiceImpl implements BulkImportService { s3Url = alreadyFoundFileLocation; } else - s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), fileNameID, fileLocationData.getDotFileExtension(), datasourceId, fileHash); + s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash); return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(), docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null. diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 395bc73..ce30a4c 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -466,7 +466,7 @@ public class FileUtils { } // The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used. // Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName). - String fileNameID = matcher.group(4); + String fileNameID = matcher.group(4); // The "fileNameID" is the OpenAIRE_ID for this file. 
if ( (fileNameID == null) || fileNameID.isEmpty() ) { logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\"."); continue; @@ -521,10 +521,10 @@ public class FileUtils { } - public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String fileNameID, + public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String openAireId, String dotFileExtension, String datasourceId, String hash) throws ConnectException, UnknownHostException { - String filenameForS3 = constructS3FileName(fileName, fileNameID, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store. + String filenameForS3 = constructS3FileName(fileName, openAireId, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store. if ( filenameForS3 == null ) // The error is logged inside. return null;