package eu.openaire.urls_worker.plugins;

import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import eu.openaire.urls_worker.UrlsWorkerApplication;
import eu.openaire.urls_worker.components.ScheduledTasks;
import eu.openaire.urls_worker.models.Assignment;
import eu.openaire.urls_worker.models.Error;
import eu.openaire.urls_worker.models.Payload;
import eu.openaire.urls_worker.models.UrlReport;
import eu.openaire.urls_worker.services.FileStorageService;
import eu.openaire.urls_worker.util.AssignmentsHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;
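
/**
 * Plugin which uses the "PublicationsRetriever" library to connect to the assigned urls,
 * discover and download the full-text documents behind them,
 * and build the url-reports which are later sent back to the Controller.
 */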
public class PublicationsRetrieverPlugin {

    private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class);

    public static String assignmentsBasePath;
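
    /**
     * Configures the underlying "PublicationsRetriever" library:
     * only documents (not datasets) are retrieved, the doc-files are downloaded and named by their IDs,
     * and a fixed thread-pool is created, sized as a multiple of the available cpu-cores.
     */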
    public PublicationsRetrieverPlugin() {
        // Specify some configurations.
        LoaderAndChecker.retrieveDocuments = true;
        LoaderAndChecker.retrieveDatasets = false;
        ConnSupportUtils.setKnownMimeTypes();
        FileUtils.shouldDownloadDocFiles = true;
        FileUtils.docFileNameType = FileUtils.DocFileNameType.idName;
        PublicationsRetriever.targetUrlType = "docUrl";
        FileUtils.jsonBatchSize = UrlsWorkerApplication.maxAssignmentsLimitPerBatch;

        assignmentsBasePath = FileStorageService.assignmentsLocation.toString();
        if ( !assignmentsBasePath.endsWith(File.separator) )
            assignmentsBasePath += File.separator;

        ConnSupportUtils.shouldBlockMost5XXDomains = false;
        LoaderAndChecker.setCouldRetryRegex();

        PublicationsRetriever.threadsMultiplier = 4;
        int workerThreadsCount = Runtime.getRuntime().availableProcessors() * PublicationsRetriever.threadsMultiplier;
        logger.info("Use " + workerThreadsCount + " worker-threads.");
        PublicationsRetriever.executor = Executors.newFixedThreadPool(workerThreadsCount);
    }

    private static final List<Callable<Boolean>> callableTasks = new ArrayList<>(FileUtils.jsonBatchSize);
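
    /**
     * Checks the given assignments in parallel: each url is checked, canonicalized and connected-to,
     * in order to discover and download the full-text document behind it.
     * Afterwards, the results are gathered into the worker-report.
     * A minimal usage sketch (the counter-value and the assignments-source are hypothetical):
     * <pre>{@code
     * Collection<Assignment> assignments = getAssignmentsFromController();  // Hypothetical helper.
     * PublicationsRetrieverPlugin.processAssignments(1L, assignments);
     * }</pre>
     */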
    public static void processAssignments(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException
    {
        // The trailing file-separator is needed, because of how the docFiles are named and stored.
        FileUtils.storeDocFilesDir = assignmentsBasePath + "assignments_" + assignmentRequestCounter + "_fullTexts" + File.separator;
        File curAssignmentsDirs = new File(FileUtils.storeDocFilesDir);
        try {
            if ( !curAssignmentsDirs.exists() ) {
                if ( !curAssignmentsDirs.mkdirs() ) {    // Create the directories.
                    String workingDir = System.getProperty("user.dir") + File.separator;
                    logger.error("Could not create the \"assignments_fullTexts\" directories: \"" + FileUtils.storeDocFilesDir + "\". Using the \"workingDir\" instead (" + workingDir + ").");
                    FileUtils.storeDocFilesDir = assignmentsBasePath = workingDir;
                }
            }
        } catch (Exception e) {
            String errorMsg = "Failed to create the full-texts directory for assignments_" + assignmentRequestCounter;
            logger.error(errorMsg, e);
            throw new RuntimeException(errorMsg + ": " + e.getMessage());
        }

        // Start loading and checking urls.
        for ( Assignment assignment : assignments )
        {
            callableTasks.add(() -> {
                String id = assignment.getId();
                String url = assignment.getOriginalUrl();
                if ( (url = LoaderAndChecker.handleUrlChecks(id, url)) == null )
                    return false;
                // The "url" might have changed (inside "handleUrlChecks()").

                String urlToCheck = url;
                String sourceUrl = urlToCheck;    // Hold it here, for the logging-messages.
                if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
                    logger.warn("Could not canonicalize url: " + sourceUrl);
                    UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    return false;
                }

                if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(url) ) {    // If we reached an already-found docUrl, log it and return.
                    ConnSupportUtils.handleReCrossedDocUrl(id, url, url, url, true);
                    return true;
                }

                boolean isPossibleDocOrDatasetUrl = false;    // Used for specific connection settings.
                String lowerCaseRetrievedUrl = url.toLowerCase();
                // Check if it's a possible docUrl or datasetUrl; if so, this info will be used for an optimal web-connection later.
                if ( (LoaderAndChecker.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches())
                        || (LoaderAndChecker.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches()) ) {
                    //logger.debug("Possible docUrl or datasetUrl: " + url);
                    isPossibleDocOrDatasetUrl = true;
                }

                try {    // Check if it's a docUrl; if not, it gets crawled.
                    HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
                } catch (Exception e) {
                    List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
                    String wasUrlValid = list.get(0);
                    String couldRetry = list.get(1);
                    UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
                    return false;
                }
                return true;
            });
        }

        int numFailedTasks = LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
        if ( numFailedTasks == -1 ) {    // The unknown exception is logged inside the above method.
            System.err.println("Invoking and/or executing the callableTasks failed, with the exception written in the log files!");
            UrlsWorkerApplication.gentleAppShutdown();
        }
        if ( numFailedTasks > 0 )
            logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);

        addUrlReportsToWorkerReport();

        callableTasks.clear();    // Reset the thread-tasks list for the next batch.
        UrlUtils.docOrDatasetUrlsWithIDs.clear();    // This HashTable is useful only for a single assignments-batch.
        // In the next batch, the previously stored files might have already been uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
    }
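
    /**
     * Converts each record of "FileUtils.dataToBeLoggedList" into a "UrlReport" (status, payload, error)
     * and adds it to "AssignmentsHandler.urlReports".
     * For a docUrl whose full-text was already downloaded from another ID-sourceUrl pair,
     * the file-data (location, size, hash) is copied over from the first-found entry of that pair.
     */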
    public static void addUrlReportsToWorkerReport()
    {
        Timestamp timestamp = new Timestamp(System.currentTimeMillis());    // Store it here, in order to have the same timestamp for all current records.
        for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
        {
            UrlReport.StatusType status = null;
            String fileLocation = null, comment = data.getComment(), mimeType = null, hash = data.getHash();
            Long size = data.getSize();
            Error error = null;

            if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) )    // The reversed order defends against a potential NPE.
            {
                status = UrlReport.StatusType.accessible;
                if ( comment.startsWith(UrlUtils.alreadyDownloadedFromIDMessage) ) {
                    // The file of this docUrl was already downloaded by another docUrl.
                    int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage);
                    int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length();
                    String initialId = comment.substring(UrlUtils.alreadyDownloadedFromIDMessage.length(), indexOfAlreadyDownloadedFromSourceUrlMessage);    // The ID starts right after the "message".
                    String initialSourceUrl = comment.substring(indexOfAlreadyDownloadedFromSourceUrl);
                    //logger.debug("initialId: " + initialId + " | sourceUrl: " + initialSourceUrl);    // DEBUG!

                    // Search for that ID and sourceUrl inside the list; if that instance is the first-found one, then get the file-data.
                    // (There might be duplicate ID-sourceUrl instances, but only one of them has the file-data.)
                    for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {
                        if ( data_2.getUrlId().equals(initialId) && data_2.getSourceUrl().equals(initialSourceUrl)
                                && ! data_2.getComment().startsWith(UrlUtils.alreadyDownloadedFromIDMessage) ) {
                            fileLocation = data_2.getComment();
                            size = data_2.getSize();
                            hash = data_2.getHash();
                            mimeType = "application/pdf";    // TODO - If support is added for doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
                            break;
                        }
                    }
                    if ( fileLocation == null )    // In case the "alreadyDownloaded" full-text is not found, we have an error.
                        error = new Error(Error.ErrorType.couldRetry, comment + " | That ID-sourceUrl was not found inside the WorkerReport!");    // We can still try to download it from the found docUrl, in the future.
                }
                else if ( ! comment.equals(HttpConnUtils.docFileNotRetrievedMessage) ) {    // The file was downloaded without an error.
                    fileLocation = comment;    // This is the full file-path.
                    mimeType = "application/pdf";
                } else    // Else the file was not retrieved, so all file-related data are kept "null".
                    error = new Error(Error.ErrorType.couldRetry, comment);    // We can still try to download it from the found docUrl, in the future.

                if ( error == null )    // If the file was retrieved, at any time.
                    error = new Error(Error.ErrorType.couldRetry, null);    // We do not want to send a "null" Error-object, since that would require more complicated handling in the Controller.
            }
            else {
                status = UrlReport.StatusType.non_accessible;
                if ( "true".equals(data.getCouldRetry()) )
                    error = new Error(Error.ErrorType.couldRetry, comment);
                else
                    error = new Error(Error.ErrorType.noRetry, comment);
            }

            String docOrDatasetUrl = data.getDocOrDatasetUrl();
            if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
                docOrDatasetUrl = null;

            // Convert "null" strings to actual < null > values.
            if ( (hash != null) && hash.equals("null") )
                hash = null;

            // TODO - If support is added for doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
            Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever");
            AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
        }// end-for
        FileUtils.dataToBeLoggedList.clear();    // Empty the list, to be re-populated by the next batch / assignment.
    }
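
    /**
     * Tests the connectivity of the worker, by connecting to the given url and checking its mime-type.
     * Returns "false" upon any connection-problem.
     * A minimal usage sketch (the test-url is hypothetical):
     * <pre>{@code
     * boolean success = PublicationsRetrieverPlugin.connectWithUrlTest("https://example.org/test");
     * }</pre>
     */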
    public static boolean connectWithUrlTest(String urlToCheck) {
        String testID = "testID";    // Use a dummy-ID, since this is just a connectivity-test.
        try {
            return HttpConnUtils.connectAndCheckMimeType(testID, urlToCheck, urlToCheck, urlToCheck, null, true, false);    // The domain-string argument is < null >, as it is unknown at this point.
        } catch (Exception e) {
            List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
            String wasUrlValid = list.get(0);
            String couldRetry = list.get(1);
            UrlUtils.logOutputData(testID, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
            return false;
        }
    }
}