- Prepare version for next release.
- Fix the bug of not using the "OpenAireID" in the S3 location of bulk-imported files. The "fileNameID" was used instead, which is the same as the OpenAireID in aggregation, but not in bulk-import. - Update dependencies. - Code polishing.
This commit is contained in:
parent
8bc5cc35e2
commit
1d821ed803
17
build.gradle
17
build.gradle
|
@ -6,7 +6,7 @@ plugins {
|
||||||
|
|
||||||
java {
|
java {
|
||||||
group = 'eu.openaire.urls_controller'
|
group = 'eu.openaire.urls_controller'
|
||||||
version = '2.6.3-SNAPSHOT'
|
version = '2.7.0-SNAPSHOT'
|
||||||
sourceCompatibility = JavaVersion.VERSION_1_8
|
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,6 +18,10 @@ repositories {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ext {
|
||||||
|
hadoopVersion = '3.4.0'
|
||||||
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
runtimeOnly "org.springframework.boot:spring-boot-devtools"
|
runtimeOnly "org.springframework.boot:spring-boot-devtools"
|
||||||
|
|
||||||
|
@ -76,7 +80,7 @@ dependencies {
|
||||||
implementation('org.apache.parquet:parquet-avro:1.13.1')
|
implementation('org.apache.parquet:parquet-avro:1.13.1')
|
||||||
|
|
||||||
// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common
|
// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common
|
||||||
implementation('org.apache.hadoop:hadoop-common:3.3.6') {
|
implementation("org.apache.hadoop:hadoop-common:$hadoopVersion") {
|
||||||
exclude group: 'org.apache.parquet', module: 'parquet-avro'
|
exclude group: 'org.apache.parquet', module: 'parquet-avro'
|
||||||
exclude group: 'org.apache.avro', module: 'avro'
|
exclude group: 'org.apache.avro', module: 'avro'
|
||||||
exclude group: 'org.slf4j', module: 'slf4j-api'
|
exclude group: 'org.slf4j', module: 'slf4j-api'
|
||||||
|
@ -92,7 +96,7 @@ dependencies {
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core
|
// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core
|
||||||
implementation('org.apache.hadoop:hadoop-mapreduce-client-core:3.3.6') {
|
implementation("org.apache.hadoop:hadoop-mapreduce-client-core:$hadoopVersion") {
|
||||||
exclude group: 'org.apache.parquet', module: 'parquet-avro'
|
exclude group: 'org.apache.parquet', module: 'parquet-avro'
|
||||||
exclude group: 'org.apache.avro', module: 'avro'
|
exclude group: 'org.apache.avro', module: 'avro'
|
||||||
exclude group: 'org.slf4j', module: 'slf4j-api'
|
exclude group: 'org.slf4j', module: 'slf4j-api'
|
||||||
|
@ -106,17 +110,16 @@ dependencies {
|
||||||
|
|
||||||
// Add back some updated version of the needed dependencies.
|
// Add back some updated version of the needed dependencies.
|
||||||
implementation 'org.apache.thrift:libthrift:0.17.0' // Newer versions (>=0.18.X) are not compatible with JAVA 8.
|
implementation 'org.apache.thrift:libthrift:0.17.0' // Newer versions (>=0.18.X) are not compatible with JAVA 8.
|
||||||
implementation 'com.fasterxml.woodstox:woodstox-core:6.6.1'
|
implementation 'com.fasterxml.woodstox:woodstox-core:6.6.2'
|
||||||
|
|
||||||
// https://mvnrepository.com/artifact/org.json/json
|
// https://mvnrepository.com/artifact/org.json/json
|
||||||
implementation 'org.json:json:20240303'
|
implementation 'org.json:json:20240303' // This is used only in "ParquetFileUtils.createRemoteParquetDirectories()". TODO - Replace it with "gson".
|
||||||
|
|
||||||
// https://mvnrepository.com/artifact/com.google.code.gson/gson
|
// https://mvnrepository.com/artifact/com.google.code.gson/gson
|
||||||
implementation 'com.google.code.gson:gson:2.10.1'
|
implementation 'com.google.code.gson:gson:2.10.1'
|
||||||
|
|
||||||
|
|
||||||
// https://mvnrepository.com/artifact/io.micrometer/micrometer-registry-prometheus
|
// https://mvnrepository.com/artifact/io.micrometer/micrometer-registry-prometheus
|
||||||
runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.3'
|
runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.4'
|
||||||
|
|
||||||
testImplementation 'org.springframework.security:spring-security-test'
|
testImplementation 'org.springframework.security:spring-security-test'
|
||||||
testImplementation "org.springframework.boot:spring-boot-starter-test"
|
testImplementation "org.springframework.boot:spring-boot-starter-test"
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
distributionBase=GRADLE_USER_HOME
|
distributionBase=GRADLE_USER_HOME
|
||||||
distributionPath=wrapper/dists
|
distributionPath=wrapper/dists
|
||||||
distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip
|
distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip
|
||||||
networkTimeout=10000
|
networkTimeout=10000
|
||||||
validateDistributionUrl=true
|
validateDistributionUrl=true
|
||||||
zipStoreBase=GRADLE_USER_HOME
|
zipStoreBase=GRADLE_USER_HOME
|
||||||
|
|
|
@ -270,7 +270,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
failedFiles.add(fileLocation);
|
failedFiles.add(fileLocation);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and right it to the file.
|
if ( ((++counter) % 150) == 0 ) { // Every 150 files, report the status for this segment and write it to the file.
|
||||||
msg = "Progress for segment-" + segmentCounter + " : " + payloadRecords.size() + " files have been imported and " + failedFiles.size() + " have failed, out of " + numOfFilesInSegment + " files.";
|
msg = "Progress for segment-" + segmentCounter + " : " + payloadRecords.size() + " files have been imported and " + failedFiles.size() + " have failed, out of " + numOfFilesInSegment + " files.";
|
||||||
if ( logger.isTraceEnabled() )
|
if ( logger.isTraceEnabled() )
|
||||||
logger.trace(msg + additionalLoggingMsg);
|
logger.trace(msg + additionalLoggingMsg);
|
||||||
|
@ -363,8 +363,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
|
private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
|
||||||
throws ConnectException, UnknownHostException
|
throws ConnectException, UnknownHostException
|
||||||
{
|
{
|
||||||
File fullTextFile = new File(fileLocation);
|
DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null);
|
||||||
DocFileData docFileData = new DocFileData(fullTextFile, null, null, null);
|
|
||||||
docFileData.calculateAndSetHashAndSize();
|
docFileData.calculateAndSetHashAndSize();
|
||||||
|
|
||||||
// Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
|
// Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
|
||||||
|
@ -421,7 +420,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
|
|
||||||
s3Url = alreadyFoundFileLocation;
|
s3Url = alreadyFoundFileLocation;
|
||||||
} else
|
} else
|
||||||
s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), fileNameID, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
|
s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
|
||||||
|
|
||||||
return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
|
return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
|
||||||
docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.
|
docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.
|
||||||
|
|
|
@ -466,7 +466,7 @@ public class FileUtils {
|
||||||
}
|
}
|
||||||
// The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
|
// The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
|
||||||
// Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
|
// Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
|
||||||
String fileNameID = matcher.group(4);
|
String fileNameID = matcher.group(4); // The "fileNameID" is the OpenAIRE_ID for this file.
|
||||||
if ( (fileNameID == null) || fileNameID.isEmpty() ) {
|
if ( (fileNameID == null) || fileNameID.isEmpty() ) {
|
||||||
logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
|
logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
|
||||||
continue;
|
continue;
|
||||||
|
@ -521,10 +521,10 @@ public class FileUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String fileNameID,
|
public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String openAireId,
|
||||||
String dotFileExtension, String datasourceId, String hash) throws ConnectException, UnknownHostException
|
String dotFileExtension, String datasourceId, String hash) throws ConnectException, UnknownHostException
|
||||||
{
|
{
|
||||||
String filenameForS3 = constructS3FileName(fileName, fileNameID, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store.
|
String filenameForS3 = constructS3FileName(fileName, openAireId, dotFileExtension, datasourceId, hash); // This name is for the uploaded file, in the S3 Object Store.
|
||||||
if ( filenameForS3 == null ) // The error is logged inside.
|
if ( filenameForS3 == null ) // The error is logged inside.
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue