From fc258e2e2648bd1900321ca03c4a7c5de1dea3c9 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 7 Jun 2024 13:21:27 +0300 Subject: [PATCH] - Rename the "pdfUrlPrefix" config-field to "fulltextUrlPrefix", as it may point to different file-formats in the future. - Code polishing. --- .../components/BulkImport.java | 19 +++++++------------ .../services/BulkImportServiceImpl.java | 10 +++++----- src/main/resources/application.yml | 4 ++-- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/main/java/eu/openaire/urls_controller/components/BulkImport.java b/src/main/java/eu/openaire/urls_controller/components/BulkImport.java index bc8e02f..115eb1a 100644 --- a/src/main/java/eu/openaire/urls_controller/components/BulkImport.java +++ b/src/main/java/eu/openaire/urls_controller/components/BulkImport.java @@ -67,7 +67,7 @@ public class BulkImport { public static class BulkImportSource { private String datasourceID; private String datasourcePrefix; - private String pdfUrlPrefix; + private String fulltextUrlPrefix; private String mimeType; private boolean isAuthoritative; @@ -91,12 +91,12 @@ public class BulkImport { this.datasourcePrefix = datasourcePrefix; } - public String getPdfUrlPrefix() { - return pdfUrlPrefix; + public String getFulltextUrlPrefix() { + return fulltextUrlPrefix; } - public void setPdfUrlPrefix(String pdfUrlPrefix) { - this.pdfUrlPrefix = pdfUrlPrefix; + public void setFulltextUrlPrefix(String fulltextUrlPrefix) { + this.fulltextUrlPrefix = fulltextUrlPrefix; } public String getMimeType() { @@ -117,13 +117,8 @@ public class BulkImport { @Override public String toString() { - return "BulkImportSource{" + - "datasourceID='" + datasourceID + '\'' + - ", datasourcePrefix='" + datasourcePrefix + '\'' + - ", pdfUrlPrefix='" + pdfUrlPrefix + '\'' + - ", mimeType='" + mimeType + '\'' + - ", isAuthoritative=" + isAuthoritative + - '}'; + return "BulkImportSource{" + "datasourceID='" + datasourceID + '\'' + ", datasourcePrefix='" + datasourcePrefix + '\'' + ", fulltextUrlPrefix='" + fulltextUrlPrefix + '\'' + ", mimeType='" + mimeType + '\'' + + ", isAuthoritative=" + isAuthoritative + '}'; } } diff --git a/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java index 1c214a6..6565094 100644 --- a/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java +++ b/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java @@ -265,7 +265,7 @@ public class BulkImportServiceImpl implements BulkImportService { HashMap hashWithExistingLocationMap = fileUtils.getHashLocationMap(fileHashes, fileHashesSetSize, segmentCounter, "segment"); int numAlreadyRetrievedFiles = hashWithExistingLocationMap.size(); if ( numAlreadyRetrievedFiles > 0 ) { - msg = numAlreadyRetrievedFiles + " files from segment_" + segmentCounter + ", have been already retrieved in the past."; + msg = numAlreadyRetrievedFiles + " files (out of " + numOfFilesInSegment + "), from segment_" + segmentCounter + ", have been already retrieved in the past."; logger.warn(msg + additionalLoggingMsg); bulkImportReport.addEvent(msg); fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true); @@ -413,10 +413,10 @@ public class BulkImportServiceImpl implements BulkImportService { return null; } - String fileNameID = fileLocationData.getFileNameID(); + String fileId = fileLocationData.getFileNameID(); // Note: This method not accept parentheses. If there is ever a publisher that uses parentheses, then we have to use another regex, than the one used for retrieved full-texts, from the Workers. - String openAireId = generateOpenaireId(fileNameID, bulkImportSource.getDatasourcePrefix(), bulkImportSource.getIsAuthoritative()); - if ( openAireId == null ) + String openAireId = generateOpenaireId(fileId, bulkImportSource.getDatasourcePrefix(), bulkImportSource.getIsAuthoritative()); + if ( openAireId == null ) // The error is logged inside. return null; String fileHash = docFileData.getHash(); // It's guaranteed to NOT be null at this point. @@ -438,7 +438,7 @@ public class BulkImportServiceImpl implements BulkImportService { } // TODO - If another url-schema is introduced for other datasources, have a "switch"-statement and perform the right "actualUrl"-creation based on current schema. - String actualUrl = (bulkImportSource.getPdfUrlPrefix() + fileNameID); // This string-concatenation, works with urls of Arvix. A different construction may be needed for other datasources. + String actualUrl = (bulkImportSource.getFulltextUrlPrefix() + fileId); // This string-concatenation, works with urls of Arvix. A different construction may be needed for other datasources. String originalUrl = actualUrl; // We have the full-text files from bulk-import, so let's assume the original-url is also the full-text-link. return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(), diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index f985b3b..cf42e0c 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -45,13 +45,13 @@ bulk-import: arxivImport: datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23 datasourcePrefix: arXiv_______ # For PID-providing datasource, we use the PID-prefix here. (so not the datasource-prefix: "od________18") - pdfUrlPrefix: https://arxiv.org/pdf/ + fulltextUrlPrefix: https://arxiv.org/pdf/ mimeType: application/pdf isAuthoritative: true # otherImport: # datasourceID: othersource__::0123 # datasourcePrefix: other_______ -# pdfUrlPrefix: https://example.org/pdf/ +# fulltextUrlPrefix: https://example.org/pdf/ # mimeType: application/pdf # isAuthoritative: false