197 lines
10 KiB
Java
197 lines
10 KiB
Java
|
package eu.openaire.urls_worker.plugins.publicationsRetriever;
|
||
|
|
||
|
import com.google.common.hash.Hashing;
|
||
|
import com.google.common.io.Files;
|
||
|
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
||
|
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||
|
import eu.openaire.publications_retriever.util.file.FileUtils;
|
||
|
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
|
||
|
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
|
||
|
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
|
||
|
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
|
||
|
import eu.openaire.publications_retriever.util.url.UrlUtils;
|
||
|
import eu.openaire.urls_worker.models.Payload;
|
||
|
import eu.openaire.urls_worker.models.Task;
|
||
|
import eu.openaire.urls_worker.models.UrlReport;
|
||
|
import eu.openaire.urls_worker.util.AssignmentHandler;
|
||
|
import org.slf4j.Logger;
|
||
|
import org.slf4j.LoggerFactory;
|
||
|
|
||
|
import java.io.File;
|
||
|
import java.io.FileNotFoundException;
|
||
|
import java.io.FileOutputStream;
|
||
|
import java.nio.charset.StandardCharsets;
|
||
|
import java.nio.file.Paths;
|
||
|
import java.util.*;
|
||
|
import java.util.concurrent.Callable;
|
||
|
import java.util.concurrent.Executors;
|
||
|
|
||
|
|
||
|
public class PublicationsRetrieverPlugin {
|
||
|
|
||
|
// Logger shared by all static methods of this plugin.
private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class);

// The JVM's working directory ("user.dir"), with a trailing file-separator.
private static final String workingDir = System.getProperty("user.dir") + File.separator;

// Directory which holds the per-assignment files. It is replaced by the "workingDir" if the directories cannot be created (see the static-block below).
private static String assignmentsBasePath = workingDir + "assignments" + File.separator;

// Directory under which the full-texts of each assignment are stored. It follows the same fallback as "assignmentsBasePath".
private static String assignmentsBaseFullTextsPath = assignmentsBasePath + "fullTexts" + File.separator;

/*
 * One-time setup, performed when this class gets loaded:
 *  1) make sure the full-texts directory exists (falling back to the working directory otherwise),
 *  2) configure the PublicationsRetriever for retrieving documents only,
 *  3) create the fixed-size thread-pool which will execute the url-checks.
 */
static {
    File fullTextsDir = new File(assignmentsBaseFullTextsPath);
    // "mkdirs()" is attempted only when the directory is missing (the call is short-circuited otherwise).
    boolean isDirAvailable = fullTextsDir.exists() || fullTextsDir.mkdirs();
    if ( !isDirAvailable ) {
        logger.error("Could not create the \"assignments directories\": \"" + assignmentsBaseFullTextsPath + "\". Using the \"workingDir\" instead (" + workingDir + ").");
        assignmentsBasePath = workingDir;
        assignmentsBaseFullTextsPath = assignmentsBasePath;
    }

    // Specify some configurations: retrieve the documents (not the datasets) and download their files.
    LoaderAndChecker.retrieveDocuments = true;
    LoaderAndChecker.retrieveDatasets = false;
    FileUtils.shouldDownloadDocFiles = true;
    PublicationsRetriever.targetUrlType = "docUrl";

    // The size of the thread-pool is the number of the available processors, multiplied by the retriever's "threadsMultiplier".
    int availableCores = Runtime.getRuntime().availableProcessors();
    int workerThreadsCount = availableCores * PublicationsRetriever.threadsMultiplier;
    logger.info("Use " + workerThreadsCount + " worker-threads.");
    PublicationsRetriever.executor = Executors.newFixedThreadPool(workerThreadsCount);
}
|
||
|
|
||
|
|
||
|
public static void processTasks(int assignmentId, Collection<Task> tasks) throws RuntimeException, FileNotFoundException
|
||
|
{
|
||
|
ConnSupportUtils.setKnownMimeTypes();
|
||
|
|
||
|
FileUtils.storeDocFilesDir = assignmentsBaseFullTextsPath + "assignment_" + assignmentId + "_fullTexts" + File.separator; // It needs the last separator, because of how the docFiles are named and stored.
|
||
|
|
||
|
FileUtils.setOutput(new FileOutputStream(assignmentsBasePath + "assignment_" + assignmentId + "_generic_results.json"));
|
||
|
|
||
|
int tasksSize = tasks.size();
|
||
|
int batchCount = 0;
|
||
|
int taskCount = 0;
|
||
|
List<Callable<Boolean>> callableTasks = new ArrayList<>(FileUtils.jsonBatchSize);
|
||
|
|
||
|
// Start loading and checking urls.
|
||
|
for ( Task task : tasks )
|
||
|
{
|
||
|
callableTasks.add(() -> {
|
||
|
String id = task.getId();
|
||
|
String url = task.getUrl();
|
||
|
|
||
|
if ( (url = LoaderAndChecker.handleUrlChecks(id, url)) == null ) {
|
||
|
return false;
|
||
|
} // The "url" might have changed (inside "handleUrlChecks()").
|
||
|
|
||
|
String urlToCheck = url;
|
||
|
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
||
|
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
|
||
|
logger.warn("Could not canonicalize url: " + sourceUrl);
|
||
|
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false");
|
||
|
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(url) ) { // If we got into an already-found docUrl, log it and return.
|
||
|
ConnSupportUtils.handleReCrossedDocUrl(id, url, url, url, logger, true);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
boolean isPossibleDocOrDatasetUrl = false; // Used for specific connection settings.
|
||
|
String lowerCaseRetrievedUrl = url.toLowerCase();
|
||
|
// Check if it's a possible-DocUrl, if so, this info will be used for optimal web-connection later.
|
||
|
if ( (LoaderAndChecker.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches())
|
||
|
|| (LoaderAndChecker.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches()) ) {
|
||
|
//logger.debug("Possible docUrl or datasetUrl: " + url);
|
||
|
isPossibleDocOrDatasetUrl = true;
|
||
|
}
|
||
|
|
||
|
try { // Check if it's a docUrl, if not, it gets crawled.
|
||
|
HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
|
||
|
} catch (Exception e) {
|
||
|
String wasUrlValid = "true";
|
||
|
if ( e instanceof RuntimeException ) {
|
||
|
String message = e.getMessage();
|
||
|
if ( (message != null) && message.contains("HTTP 404 Client Error") )
|
||
|
wasUrlValid = "false";
|
||
|
}
|
||
|
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false");
|
||
|
}
|
||
|
return true;
|
||
|
});
|
||
|
|
||
|
if ( ((++taskCount) >= FileUtils.jsonBatchSize) || (taskCount >= tasksSize) )
|
||
|
{
|
||
|
logger.info("Batch counter: " + (++batchCount) + " | progress: " + PublicationsRetriever.df.format(((batchCount-1) * taskCount) * 100.0 / tasksSize) + "% | every batch contains " + FileUtils.jsonBatchSize + " id-url pairs.");
|
||
|
LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
|
||
|
addUrlReportsToWorkerReport();
|
||
|
callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); // Reset the thread-tasks-list for the next batch.
|
||
|
}
|
||
|
}// end tasks-for-loop
|
||
|
}
|
||
|
|
||
|
|
||
|
public static void addUrlReportsToWorkerReport()
|
||
|
{
|
||
|
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
||
|
{
|
||
|
String status = null, fileLocation = null, md5 = null;
|
||
|
Long size = null;
|
||
|
if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
|
||
|
{
|
||
|
status = "accessible";
|
||
|
fileLocation = data.getComment();
|
||
|
if ( fileLocation.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
|
||
|
// The file of this docUrl was already downloaded by another docUrl.
|
||
|
String previousId = fileLocation.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1);
|
||
|
//logger.debug("previousId: " + previousId); // DEBUG!
|
||
|
// Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location.
|
||
|
for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {
|
||
|
if ( data_2.getUrlId().equals(previousId) && data_2.getWasDocumentOrDatasetAccessible().equals("true") ) {
|
||
|
fileLocation = data_2.getComment();
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else if ( fileLocation.contains("DocFileNotRetrievedException") )
|
||
|
fileLocation = null;
|
||
|
|
||
|
if ( fileLocation != null ) {
|
||
|
try {
|
||
|
File docFile = new File(fileLocation);
|
||
|
if ( docFile.isFile() ) {
|
||
|
md5 = Files.hash(docFile, Hashing.md5()).toString(); // These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files.
|
||
|
//logger.debug("MD5 for file \"" + docFile.getName() + "\": " + md5); // DEBUG!
|
||
|
size = java.nio.file.Files.size(Paths.get(fileLocation));
|
||
|
//logger.debug("Size of file \"" + docFile.getName() + "\": " + size); // DEBUG!
|
||
|
} else
|
||
|
logger.error("No file was found with path: " + fileLocation);
|
||
|
} catch (Exception e) {
|
||
|
if ( md5 == null )
|
||
|
logger.error("Could not retrieve the MD5-hash for the file: " + fileLocation);
|
||
|
|
||
|
if ( size == null )
|
||
|
logger.error("Could not retrieve the size of the file: " + fileLocation);
|
||
|
|
||
|
e.printStackTrace();
|
||
|
}
|
||
|
} else
|
||
|
fileLocation = "File not retrieved";
|
||
|
} else
|
||
|
status = "non-accessible";
|
||
|
|
||
|
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), data.getDocOrDatasetUrl(), new Date(), "application/pdf", size, "more_info", md5, fileLocation, null);
|
||
|
AssignmentHandler.urlReports.add(new UrlReport(status, payload));
|
||
|
}
|
||
|
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
|
||
|
}
|
||
|
|
||
|
|
||
|
public static boolean connectWithUrlTest(String urlToCheck) {
|
||
|
try {
|
||
|
return HttpConnUtils.connectAndCheckMimeType("null", urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
|
||
|
} catch (Exception e) {
|
||
|
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false");
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
}
|