- Rename the "pdfUrlPrefix" config-field to "fulltextUrlPrefix", as it may point to different file-formats in the future.

- Code polishing.
This commit is contained in:
Lampros Smyrnaios 2024-06-07 13:21:27 +03:00
parent ed7bf09f9b
commit fc258e2e26
3 changed files with 14 additions and 19 deletions

View File

@ -67,7 +67,7 @@ public class BulkImport {
public static class BulkImportSource {
private String datasourceID;
private String datasourcePrefix;
private String pdfUrlPrefix;
private String fulltextUrlPrefix;
private String mimeType;
private boolean isAuthoritative;
@ -91,12 +91,12 @@ public class BulkImport {
this.datasourcePrefix = datasourcePrefix;
}
public String getPdfUrlPrefix() {
return pdfUrlPrefix;
public String getFulltextUrlPrefix() {
return fulltextUrlPrefix;
}
public void setPdfUrlPrefix(String pdfUrlPrefix) {
this.pdfUrlPrefix = pdfUrlPrefix;
public void setFulltextUrlPrefix(String fulltextUrlPrefix) {
this.fulltextUrlPrefix = fulltextUrlPrefix;
}
public String getMimeType() {
@ -117,13 +117,8 @@ public class BulkImport {
@Override
public String toString() {
return "BulkImportSource{" +
"datasourceID='" + datasourceID + '\'' +
", datasourcePrefix='" + datasourcePrefix + '\'' +
", pdfUrlPrefix='" + pdfUrlPrefix + '\'' +
", mimeType='" + mimeType + '\'' +
", isAuthoritative=" + isAuthoritative +
'}';
return "BulkImportSource{" + "datasourceID='" + datasourceID + '\'' + ", datasourcePrefix='" + datasourcePrefix + '\'' + ", fulltextUrlPrefix='" + fulltextUrlPrefix + '\'' + ", mimeType='" + mimeType + '\'' +
", isAuthoritative=" + isAuthoritative + '}';
}
}

View File

@ -265,7 +265,7 @@ public class BulkImportServiceImpl implements BulkImportService {
HashMap<String, String> hashWithExistingLocationMap = fileUtils.getHashLocationMap(fileHashes, fileHashesSetSize, segmentCounter, "segment");
int numAlreadyRetrievedFiles = hashWithExistingLocationMap.size();
if ( numAlreadyRetrievedFiles > 0 ) {
msg = numAlreadyRetrievedFiles + " files from segment_" + segmentCounter + ", have been already retrieved in the past.";
msg = numAlreadyRetrievedFiles + " files (out of " + numOfFilesInSegment + "), from segment_" + segmentCounter + ", have been already retrieved in the past.";
logger.warn(msg + additionalLoggingMsg);
bulkImportReport.addEvent(msg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
@ -413,10 +413,10 @@ public class BulkImportServiceImpl implements BulkImportService {
return null;
}
String fileNameID = fileLocationData.getFileNameID();
String fileId = fileLocationData.getFileNameID(); // Note: This method does not accept parentheses. If there is ever a publisher that uses parentheses, then we will have to use a different regex from the one used for the full-texts retrieved by the Workers.
String openAireId = generateOpenaireId(fileNameID, bulkImportSource.getDatasourcePrefix(), bulkImportSource.getIsAuthoritative());
if ( openAireId == null )
String openAireId = generateOpenaireId(fileId, bulkImportSource.getDatasourcePrefix(), bulkImportSource.getIsAuthoritative());
if ( openAireId == null ) // The error is logged inside.
return null;
String fileHash = docFileData.getHash(); // It's guaranteed to NOT be null at this point.
@ -438,7 +438,7 @@ public class BulkImportServiceImpl implements BulkImportService {
}
// TODO - If another url-schema is introduced for other datasources, have a "switch"-statement and perform the right "actualUrl"-creation based on current schema.
String actualUrl = (bulkImportSource.getPdfUrlPrefix() + fileNameID); // This string-concatenation, works with urls of Arvix. A different construction may be needed for other datasources.
String actualUrl = (bulkImportSource.getFulltextUrlPrefix() + fileId); // This string-concatenation works with the urls of arXiv. A different construction may be needed for other datasources.
String originalUrl = actualUrl; // We have the full-text files from bulk-import, so let's assume the original-url is also the full-text-link.
return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),

View File

@ -45,13 +45,13 @@ bulk-import:
arxivImport:
datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
datasourcePrefix: arXiv_______ # For a PID-providing datasource, we use the PID-prefix here (so not the datasource-prefix: "od________18").
pdfUrlPrefix: https://arxiv.org/pdf/
fulltextUrlPrefix: https://arxiv.org/pdf/
mimeType: application/pdf
isAuthoritative: true
# otherImport:
# datasourceID: othersource__::0123
# datasourcePrefix: other_______
# pdfUrlPrefix: https://example.org/pdf/
# fulltextUrlPrefix: https://example.org/pdf/
# mimeType: application/pdf
# isAuthoritative: false