forked from lsmyrnaios/UrlsController
- Rename the "pdfUrlPrefix" config-field to "fulltextUrlPrefix", as it may point to different file-formats in the future.
- Code polishing.
This commit is contained in:
parent ed7bf09f9b
commit fc258e2e26
@@ -67,7 +67,7 @@ public class BulkImport {
     public static class BulkImportSource {
         private String datasourceID;
         private String datasourcePrefix;
-        private String pdfUrlPrefix;
+        private String fulltextUrlPrefix;
         private String mimeType;
         private boolean isAuthoritative;
 
@@ -91,12 +91,12 @@ public class BulkImport {
             this.datasourcePrefix = datasourcePrefix;
         }
 
-        public String getPdfUrlPrefix() {
-            return pdfUrlPrefix;
+        public String getFulltextUrlPrefix() {
+            return fulltextUrlPrefix;
         }
 
-        public void setPdfUrlPrefix(String pdfUrlPrefix) {
-            this.pdfUrlPrefix = pdfUrlPrefix;
+        public void setFulltextUrlPrefix(String fulltextUrlPrefix) {
+            this.fulltextUrlPrefix = fulltextUrlPrefix;
         }
 
         public String getMimeType() {
@@ -117,13 +117,8 @@ public class BulkImport {
 
         @Override
         public String toString() {
-            return "BulkImportSource{" +
-                    "datasourceID='" + datasourceID + '\'' +
-                    ", datasourcePrefix='" + datasourcePrefix + '\'' +
-                    ", pdfUrlPrefix='" + pdfUrlPrefix + '\'' +
-                    ", mimeType='" + mimeType + '\'' +
-                    ", isAuthoritative=" + isAuthoritative +
-                    '}';
+            return "BulkImportSource{" + "datasourceID='" + datasourceID + '\'' + ", datasourcePrefix='" + datasourcePrefix + '\'' + ", fulltextUrlPrefix='" + fulltextUrlPrefix + '\'' + ", mimeType='" + mimeType + '\'' +
+                    ", isAuthoritative=" + isAuthoritative + '}';
         }
     }
 
@@ -265,7 +265,7 @@ public class BulkImportServiceImpl implements BulkImportService {
         HashMap<String, String> hashWithExistingLocationMap = fileUtils.getHashLocationMap(fileHashes, fileHashesSetSize, segmentCounter, "segment");
         int numAlreadyRetrievedFiles = hashWithExistingLocationMap.size();
         if ( numAlreadyRetrievedFiles > 0 ) {
-            msg = numAlreadyRetrievedFiles + " files from segment_" + segmentCounter + " have already been retrieved in the past.";
+            msg = numAlreadyRetrievedFiles + " files (out of " + numOfFilesInSegment + ") from segment_" + segmentCounter + " have already been retrieved in the past.";
             logger.warn(msg + additionalLoggingMsg);
             bulkImportReport.addEvent(msg);
             fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
@@ -413,10 +413,10 @@ public class BulkImportServiceImpl implements BulkImportService {
             return null;
         }
 
-        String fileNameID = fileLocationData.getFileNameID();
+        String fileId = fileLocationData.getFileNameID();  // Note: this method does not accept parentheses. If there is ever a publisher that uses parentheses, then we have to use another regex than the one used for full-texts retrieved from the Workers.
 
-        String openAireId = generateOpenaireId(fileNameID, bulkImportSource.getDatasourcePrefix(), bulkImportSource.getIsAuthoritative());
-        if ( openAireId == null )
+        String openAireId = generateOpenaireId(fileId, bulkImportSource.getDatasourcePrefix(), bulkImportSource.getIsAuthoritative());
+        if ( openAireId == null )  // The error is logged inside.
             return null;
 
         String fileHash = docFileData.getHash();  // It's guaranteed to NOT be null at this point.
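
The comment on getFileNameID() refers to the regex that reduces a file-name to the id used for OpenAIRE-id generation. As a purely hypothetical illustration of that remark (these patterns are assumptions, not the project's actual regexes), a strict bulk-import pattern that rejects parentheses could sit next to a looser, Worker-style one that tolerates a "(1)" duplicate counter:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FileNameIdSketch {

    // Hypothetical strict pattern for bulk-imported files: the id part may not contain parentheses.
    private static final Pattern BULK_IMPORT_ID = Pattern.compile("^([^()]+)\\.[A-Za-z0-9]+$");

    // Hypothetical looser pattern, in the spirit of the Worker-retrieved full-texts mentioned above,
    // which would also accept a "(1)"-style duplicate counter before the file extension.
    private static final Pattern WORKER_ID = Pattern.compile("^(.+?)(?:\\(\\d+\\))?\\.[A-Za-z0-9]+$");

    // Returns the id part of the file-name, or null if it does not match the strict pattern.
    static String getFileNameId(String fileName) {
        Matcher matcher = BULK_IMPORT_ID.matcher(fileName);
        return matcher.matches() ? matcher.group(1) : null;
    }
}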
@@ -438,7 +438,7 @@ public class BulkImportServiceImpl implements BulkImportService {
         }
 
         // TODO - If another url-schema is introduced for other datasources, have a "switch"-statement and perform the right "actualUrl"-creation based on the current schema.
-        String actualUrl = (bulkImportSource.getPdfUrlPrefix() + fileNameID);  // This string-concatenation works with urls of arXiv. A different construction may be needed for other datasources.
+        String actualUrl = (bulkImportSource.getFulltextUrlPrefix() + fileId);  // This string-concatenation works with urls of arXiv. A different construction may be needed for other datasources.
         String originalUrl = actualUrl;  // We have the full-text files from bulk-import, so let's assume the original-url is also the full-text-link.
 
         return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
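
The TODO above anticipates more than one url-schema. A minimal sketch of such a switch, under the assumption of a hypothetical per-datasource schema identifier (nothing like UrlSchema exists in this commit; only the fulltextUrlPrefix value does):

public class ActualUrlBuilderSketch {

    // Hypothetical schema identifiers; "QUERY_PARAM" is only an example of a second schema.
    enum UrlSchema { ARXIV, QUERY_PARAM }

    static String buildActualUrl(UrlSchema schema, String fulltextUrlPrefix, String fileId) {
        switch (schema) {
            case ARXIV:
                // What the code does today: plain concatenation, e.g. https://arxiv.org/pdf/<fileId>
                return fulltextUrlPrefix + fileId;
            case QUERY_PARAM:
                // Example of a datasource that expects the file-id as a query parameter instead.
                return fulltextUrlPrefix + "?id=" + fileId;
            default:
                throw new IllegalArgumentException("Unknown url-schema: " + schema);
        }
    }
}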
@@ -45,13 +45,13 @@ bulk-import:
   arxivImport:
     datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
     datasourcePrefix: arXiv_______ # For PID-providing datasource, we use the PID-prefix here. (so not the datasource-prefix: "od________18")
-    pdfUrlPrefix: https://arxiv.org/pdf/
+    fulltextUrlPrefix: https://arxiv.org/pdf/
     mimeType: application/pdf
     isAuthoritative: true
 #  otherImport:
 #    datasourceID: othersource__::0123
 #    datasourcePrefix: other_______
-#    pdfUrlPrefix: https://example.org/pdf/
+#    fulltextUrlPrefix: https://example.org/pdf/
 #    mimeType: application/pdf
 #    isAuthoritative: false
 
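
For orientation, a YAML block like the one above would normally be bound onto the BulkImport / BulkImportSource classes edited earlier in this commit. The following is an assumed Spring Boot @ConfigurationProperties sketch of that binding; the map-valued bulkImportSources property and the annotations are assumptions, not shown in this diff:

import java.util.Map;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

// Assumed wiring that mirrors the fields visible in the diff; not taken verbatim from the project.
@Component
@ConfigurationProperties(prefix = "bulk-import")
public class BulkImport {

    // e.g. "arxivImport" -> { datasourceID, datasourcePrefix, fulltextUrlPrefix, mimeType, isAuthoritative }
    private Map<String, BulkImportSource> bulkImportSources;

    public Map<String, BulkImportSource> getBulkImportSources() {
        return bulkImportSources;
    }

    public void setBulkImportSources(Map<String, BulkImportSource> bulkImportSources) {
        this.bulkImportSources = bulkImportSources;
    }
}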