forked from lsmyrnaios/UrlsController
- Rename the "pdfUrlPrefix" config-field to "fulltextUrlPrefix", as it may point to different file-formats in the future.
- Code polishing.
This commit is contained in:
parent ed7bf09f9b
commit fc258e2e26
@@ -67,7 +67,7 @@ public class BulkImport {
     public static class BulkImportSource {
         private String datasourceID;
         private String datasourcePrefix;
-        private String pdfUrlPrefix;
+        private String fulltextUrlPrefix;
         private String mimeType;
         private boolean isAuthoritative;
 
@@ -91,12 +91,12 @@ public class BulkImport {
             this.datasourcePrefix = datasourcePrefix;
         }
 
-        public String getPdfUrlPrefix() {
-            return pdfUrlPrefix;
+        public String getFulltextUrlPrefix() {
+            return fulltextUrlPrefix;
         }
 
-        public void setPdfUrlPrefix(String pdfUrlPrefix) {
-            this.pdfUrlPrefix = pdfUrlPrefix;
+        public void setFulltextUrlPrefix(String fulltextUrlPrefix) {
+            this.fulltextUrlPrefix = fulltextUrlPrefix;
         }
 
         public String getMimeType() {
@@ -117,13 +117,8 @@ public class BulkImport {
 
         @Override
         public String toString() {
-            return "BulkImportSource{" +
-                    "datasourceID='" + datasourceID + '\'' +
-                    ", datasourcePrefix='" + datasourcePrefix + '\'' +
-                    ", pdfUrlPrefix='" + pdfUrlPrefix + '\'' +
-                    ", mimeType='" + mimeType + '\'' +
-                    ", isAuthoritative=" + isAuthoritative +
-                    '}';
+            return "BulkImportSource{" + "datasourceID='" + datasourceID + '\'' + ", datasourcePrefix='" + datasourcePrefix + '\'' + ", fulltextUrlPrefix='" + fulltextUrlPrefix + '\'' + ", mimeType='" + mimeType + '\'' +
+                    ", isAuthoritative=" + isAuthoritative + '}';
         }
     }
 
@@ -265,7 +265,7 @@ public class BulkImportServiceImpl implements BulkImportService {
         HashMap<String, String> hashWithExistingLocationMap = fileUtils.getHashLocationMap(fileHashes, fileHashesSetSize, segmentCounter, "segment");
         int numAlreadyRetrievedFiles = hashWithExistingLocationMap.size();
         if ( numAlreadyRetrievedFiles > 0 ) {
-            msg = numAlreadyRetrievedFiles + " files from segment_" + segmentCounter + " have already been retrieved in the past.";
+            msg = numAlreadyRetrievedFiles + " files (out of " + numOfFilesInSegment + ") from segment_" + segmentCounter + " have already been retrieved in the past.";
             logger.warn(msg + additionalLoggingMsg);
             bulkImportReport.addEvent(msg);
             fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
@@ -413,10 +413,10 @@ public class BulkImportServiceImpl implements BulkImportService {
             return null;
         }
 
-        String fileNameID = fileLocationData.getFileNameID();
+        String fileId = fileLocationData.getFileNameID();  // Note: this method does not accept parentheses. If there is ever a publisher that uses parentheses, then we have to use another regex than the one used for full-texts retrieved from the Workers.
 
-        String openAireId = generateOpenaireId(fileNameID, bulkImportSource.getDatasourcePrefix(), bulkImportSource.getIsAuthoritative());
-        if ( openAireId == null )
+        String openAireId = generateOpenaireId(fileId, bulkImportSource.getDatasourcePrefix(), bulkImportSource.getIsAuthoritative());
+        if ( openAireId == null )  // The error is logged inside.
             return null;
 
         String fileHash = docFileData.getHash();  // It's guaranteed to NOT be null at this point.
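
The comment on getFileNameID() refers to the regex that reduces a file-name to the id used for OpenAIRE-id generation. As a purely hypothetical illustration of that remark (these patterns are assumptions, not the project's actual regexes), a strict bulk-import pattern that rejects parentheses could sit next to a looser, Worker-style one that tolerates a "(1)" duplicate counter:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FileNameIdSketch {

    // Hypothetical strict pattern for bulk-imported files: the id part may not contain parentheses.
    private static final Pattern BULK_IMPORT_ID = Pattern.compile("^([^()]+)\\.[A-Za-z0-9]+$");

    // Hypothetical looser pattern, in the spirit of the Worker-retrieved full-texts mentioned above,
    // which would also accept a "(1)"-style duplicate counter before the file extension.
    private static final Pattern WORKER_ID = Pattern.compile("^(.+?)(?:\\(\\d+\\))?\\.[A-Za-z0-9]+$");

    // Returns the id part of the file-name, or null if it does not match the strict pattern.
    static String getFileNameId(String fileName) {
        Matcher matcher = BULK_IMPORT_ID.matcher(fileName);
        return matcher.matches() ? matcher.group(1) : null;
    }
}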
@@ -438,7 +438,7 @@ public class BulkImportServiceImpl implements BulkImportService {
         }
 
         // TODO - If another url-schema is introduced for other datasources, have a "switch"-statement and perform the right "actualUrl"-creation based on the current schema.
-        String actualUrl = (bulkImportSource.getPdfUrlPrefix() + fileNameID);  // This string-concatenation works with urls of arXiv. A different construction may be needed for other datasources.
+        String actualUrl = (bulkImportSource.getFulltextUrlPrefix() + fileId);  // This string-concatenation works with urls of arXiv. A different construction may be needed for other datasources.
         String originalUrl = actualUrl;  // We have the full-text files from bulk-import, so let's assume the original-url is also the full-text-link.
 
         return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
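
The TODO above anticipates more than one url-schema. A minimal sketch of such a switch, under the assumption of a hypothetical per-datasource schema identifier (nothing like UrlSchema exists in this commit; only the fulltextUrlPrefix value does):

public class ActualUrlBuilderSketch {

    // Hypothetical schema identifiers; "QUERY_PARAM" is only an example of a second schema.
    enum UrlSchema { ARXIV, QUERY_PARAM }

    static String buildActualUrl(UrlSchema schema, String fulltextUrlPrefix, String fileId) {
        switch (schema) {
            case ARXIV:
                // What the code does today: plain concatenation, e.g. https://arxiv.org/pdf/<fileId>
                return fulltextUrlPrefix + fileId;
            case QUERY_PARAM:
                // Example of a datasource that expects the file-id as a query parameter instead.
                return fulltextUrlPrefix + "?id=" + fileId;
            default:
                throw new IllegalArgumentException("Unknown url-schema: " + schema);
        }
    }
}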
@@ -45,13 +45,13 @@ bulk-import:
   arxivImport:
     datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
     datasourcePrefix: arXiv_______ # For PID-providing datasource, we use the PID-prefix here. (so not the datasource-prefix: "od________18")
-    pdfUrlPrefix: https://arxiv.org/pdf/
+    fulltextUrlPrefix: https://arxiv.org/pdf/
     mimeType: application/pdf
     isAuthoritative: true
 #  otherImport:
 #    datasourceID: othersource__::0123
 #    datasourcePrefix: other_______
-#    pdfUrlPrefix: https://example.org/pdf/
+#    fulltextUrlPrefix: https://example.org/pdf/
 #    mimeType: application/pdf
 #    isAuthoritative: false
 
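
For orientation, a YAML block like the one above would normally be bound onto the BulkImport / BulkImportSource classes edited earlier in this commit. The following is an assumed Spring Boot @ConfigurationProperties sketch of that binding; the map-valued bulkImportSources property and the annotations are assumptions, not shown in this diff:

import java.util.Map;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

// Assumed wiring that mirrors the fields visible in the diff; not taken verbatim from the project.
@Component
@ConfigurationProperties(prefix = "bulk-import")
public class BulkImport {

    // e.g. "arxivImport" -> { datasourceID, datasourcePrefix, fulltextUrlPrefix, mimeType, isAuthoritative }
    private Map<String, BulkImportSource> bulkImportSources;

    public Map<String, BulkImportSource> getBulkImportSources() {
        return bulkImportSources;
    }

    public void setBulkImportSources(Map<String, BulkImportSource> bulkImportSources) {
        this.bulkImportSources = bulkImportSources;
    }
}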