UrlsWorker/src/main/java/eu/openaire/urls_worker/plugins/publicationsRetriever/PublicationsRetrieverPlugin...

package eu.openaire.urls_worker.plugins.publicationsRetriever;

import com.google.common.hash.Hashing;
import com.google.common.io.Files;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import eu.openaire.urls_worker.models.Payload;
import eu.openaire.urls_worker.models.Task;
import eu.openaire.urls_worker.models.UrlReport;
import eu.openaire.urls_worker.util.AssignmentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;


/**
 * Plugin which drives the "PublicationsRetriever" library from inside the worker.
 * <p>
 * It checks the urls of the given tasks in parallel batches and converts the results, which the library
 * collects in {@code FileUtils.dataToBeLoggedList}, into {@link UrlReport}s that are appended to
 * {@code AssignmentHandler.urlReports}.
 * <p>
 * All state is static; the static initializer configures the library and creates its thread-pool once, at class-loading time.
 */
public class PublicationsRetrieverPlugin {

    private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class);

    private static final String workingDir = System.getProperty("user.dir") + File.separator;
    private static String assignmentsBasePath = workingDir + "assignments" + File.separator;
    private static String assignmentsBaseFullTextsPath = assignmentsBasePath + "fullTexts" + File.separator;

    // Placeholder-ID for the connectivity-test. It is the word < null > in quotes (and not a real null), to avoid an NPE in the concurrent data-structures.
    private static final String TEST_URL_ID = "null";

    static {
        File assignmentsDir = new File(assignmentsBaseFullTextsPath);
        if ( !assignmentsDir.exists() && !assignmentsDir.mkdirs() ) {   // Create the directory, if it's missing.
            logger.error("Could not create the \"assignments directories\": \"" + assignmentsBaseFullTextsPath + "\". Using the \"workingDir\" instead (" + workingDir + ").");
            assignmentsBasePath = workingDir;
            assignmentsBaseFullTextsPath = assignmentsBasePath;
        }

        // Specify some configurations
        LoaderAndChecker.retrieveDocuments = true;
        LoaderAndChecker.retrieveDatasets = false;
        FileUtils.shouldDownloadDocFiles = true;
        PublicationsRetriever.targetUrlType = "docUrl";

        int workerThreadsCount = Runtime.getRuntime().availableProcessors() * PublicationsRetriever.threadsMultiplier;
        logger.info("Use " + workerThreadsCount + " worker-threads.");
        PublicationsRetriever.executor = Executors.newFixedThreadPool(workerThreadsCount);
    }


    /**
     * Checks the urls of the given tasks, in batches of at most {@code FileUtils.jsonBatchSize} tasks.
     * After each batch finishes, its results are moved to the worker-report (see {@link #addUrlReportsToWorkerReport()}).
     *
     * @param assignmentId the id of the assignment; it is used in the names of the full-texts directory and of the results-file
     * @param tasks the id-url pairs to be checked
     * @throws FileNotFoundException if the results-file of this assignment cannot be opened for writing
     */
    public static void processTasks(int assignmentId, Collection<Task> tasks) throws RuntimeException, FileNotFoundException
    {
        ConnSupportUtils.setKnownMimeTypes();

        FileUtils.storeDocFilesDir = assignmentsBaseFullTextsPath + "assignment_" + assignmentId + "_fullTexts" + File.separator;  // It needs the last separator, because of how the docFiles are named and stored.

        // NOTE(review): this stream is handed over to "FileUtils" and it is never closed in this class. Confirm that "FileUtils" takes care of closing it.
        FileUtils.setOutput(new FileOutputStream(assignmentsBasePath + "assignment_" + assignmentId + "_generic_results.json"));

        int tasksSize = tasks.size();
        int batchCount = 0;
        int submittedTasksCount = 0;    // How many tasks have been handed to a batch so far (the current batch included).
        List<Callable<Boolean>> callableTasks = new ArrayList<>(FileUtils.jsonBatchSize);

        // Start loading and checking urls.
        for ( Task task : tasks )
        {
            callableTasks.add(createUrlCheckCallable(task));
            submittedTasksCount ++;

            // Run the batch when it is full, or when the last task was just added.
            // The size of the current batch is checked (and not the total counter), otherwise every task after the first full batch would run as a batch of its own.
            if ( (callableTasks.size() >= FileUtils.jsonBatchSize) || (submittedTasksCount >= tasksSize) )
            {
                int tasksHandledBeforeThisBatch = submittedTasksCount - callableTasks.size();
                logger.info("Batch counter: " + (++batchCount) + " | progress: " + PublicationsRetriever.df.format(tasksHandledBeforeThisBatch * 100.0 / tasksSize) + "% | every batch contains " + FileUtils.jsonBatchSize + " id-url pairs.");
                LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
                addUrlReportsToWorkerReport();
                callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); // Reset the thread-tasks-list for the next batch.
            }
        }// end tasks-for-loop
    }


    /**
     * Creates the unit of work which checks the url of a single task.
     *
     * @param task the id-url pair to be checked
     * @return a callable which gives {@code false} when the url was discarded before any connection was attempted, otherwise {@code true}
     */
    private static Callable<Boolean> createUrlCheckCallable(Task task)
    {
        return () -> {
            String id = task.getId();
            String url = LoaderAndChecker.handleUrlChecks(id, task.getUrl());    // The "url" might have changed (inside "handleUrlChecks()").
            if ( url == null )
                return false;

            String sourceUrl = url;    // Hold it here for the logging-messages.
            String urlToCheck = url;
            if ( !sourceUrl.contains("#/") ) {  // The urls containing "#/" are used as they are, without canonicalization.
                urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8);
                if ( urlToCheck == null ) {
                    logger.warn("Could not canonicalize url: " + sourceUrl);
                    UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false");
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    return false;
                }
            }

            if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(url) ) {    // If we got into an already-found docUrl, log it and return.
                ConnSupportUtils.handleReCrossedDocUrl(id, url, url, url, logger, true);
                return true;
            }

            // Check if it's a possible-DocUrl, if so, this info will be used for optimal web-connection later.
            String lowerCaseRetrievedUrl = url.toLowerCase();
            boolean isPossibleDocOrDatasetUrl =
                    (LoaderAndChecker.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches())
                    || (LoaderAndChecker.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches());

            try {    // Check if it's a docUrl, if not, it gets crawled.
                HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
            } catch (Exception e) {
                // The url is reported as invalid, only when the failure was an "HTTP 404".
                String wasUrlValid = "true";
                if ( e instanceof RuntimeException ) {
                    String message = e.getMessage();
                    if ( (message != null) && message.contains("HTTP 404 Client Error") )
                        wasUrlValid = "false";
                }
                UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false");
            }
            return true;
        };
    }


    /**
     * Converts every record of {@code FileUtils.dataToBeLoggedList} to a {@link UrlReport}, adds it to {@code AssignmentHandler.urlReports}
     * and then empties the list.
     * For the accessible records whose file exists on disk, the MD5-hash and the size of the file are included in the report.
     */
    public static void addUrlReportsToWorkerReport()
    {
        for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
        {
            String status, fileLocation = null, md5 = null;
            Long size = null;
            if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) )
            {
                status = "accessible";
                fileLocation = resolveFileLocation(data.getComment());
                if ( fileLocation != null ) {
                    try {
                        File docFile = new File(fileLocation);
                        if ( docFile.isFile() ) {
                            md5 = Files.hash(docFile, Hashing.md5()).toString();    // These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files.
                            size = java.nio.file.Files.size(Paths.get(fileLocation));
                        } else
                            logger.error("No file was found with path: " + fileLocation);
                    } catch (Exception e) {
                        // The size is computed after the hash, so when the hash is missing, the size is missing as well.
                        if ( md5 == null ) {
                            logger.error("Could not retrieve the MD5-hash for the file: " + fileLocation, e);
                            logger.error("Could not retrieve the size of the file: " + fileLocation);
                        } else
                            logger.error("Could not retrieve the size of the file: " + fileLocation, e);
                    }
                } else
                    fileLocation = "File not retrieved";
            } else
                status = "non-accessible";

            Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), data.getDocOrDatasetUrl(), new Date(), "application/pdf", size, "more_info", md5, fileLocation, null);
            AssignmentHandler.urlReports.add(new UrlReport(status, payload));
        }
        FileUtils.dataToBeLoggedList.clear();   // Empty the list, to be re-populated by the next batch / assignment.
    }


    /**
     * Extracts the location of the downloaded file from the comment of an accessible record.
     *
     * @param comment the comment of the record; it may be {@code null}
     * @return the file-location, or {@code null} when the comment is missing or it says that the file was not retrieved
     */
    private static String resolveFileLocation(String comment)
    {
        if ( comment == null )
            return null;

        if ( comment.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {
            // The file of this docUrl was already downloaded by another docUrl.
            String previousId = comment.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1);
            // Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location.
            for ( DataToBeLogged otherData : FileUtils.dataToBeLoggedList ) {
                if ( previousId.equals(otherData.getUrlId()) && "true".equals(otherData.getWasDocumentOrDatasetAccessible()) )
                    return otherData.getComment();
            }
            // NOTE(review): no such record exists in the current list (it is cleared after every batch), so the message itself is returned and no file will be found for it.
            return comment;
        }

        if ( comment.contains("DocFileNotRetrievedException") )
            return null;

        return comment;
    }


    /**
     * Tries to connect with the given url, as a connectivity-test.
     *
     * @param urlToCheck the url to connect with
     * @return the result of the connection-check, or {@code false} if the check threw an exception
     */
    public static boolean connectWithUrlTest(String urlToCheck) {
        try {
            return HttpConnUtils.connectAndCheckMimeType(TEST_URL_ID, urlToCheck, urlToCheck, urlToCheck, null, true, false);
        } catch (Exception e) {
            // Use the same placeholder-ID as above, so that no record with a real < null > ID gets logged.
            UrlUtils.logOutputData(TEST_URL_ID, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false");
            return false;
        }
    }
}
Integrate the "PublicationsRetriever" program as a plugin, which downloads the full-texts of the publications. Afterwards, the retrieved data info is transferred to the Controller. The "PublicationsRetriever" can be installed locally as a library, using the "installPublicationsRetriever.sh" script. 2021-06-22 04:58:07 +02:00			`package eu.openaire.urls_worker.plugins.publicationsRetriever;`

			`import com.google.common.hash.Hashing;`
			`import com.google.common.io.Files;`
			`import edu.uci.ics.crawler4j.url.URLCanonicalizer;`
			`import eu.openaire.publications_retriever.PublicationsRetriever;`
			`import eu.openaire.publications_retriever.util.file.FileUtils;`
			`import eu.openaire.publications_retriever.util.http.ConnSupportUtils;`
			`import eu.openaire.publications_retriever.util.http.HttpConnUtils;`
			`import eu.openaire.publications_retriever.util.url.DataToBeLogged;`
			`import eu.openaire.publications_retriever.util.url.LoaderAndChecker;`
			`import eu.openaire.publications_retriever.util.url.UrlUtils;`
			`import eu.openaire.urls_worker.models.Payload;`
			`import eu.openaire.urls_worker.models.Task;`
			`import eu.openaire.urls_worker.models.UrlReport;`
			`import eu.openaire.urls_worker.util.AssignmentHandler;`
			`import org.slf4j.Logger;`
			`import org.slf4j.LoggerFactory;`

			`import java.io.File;`
			`import java.io.FileNotFoundException;`
			`import java.io.FileOutputStream;`
			`import java.nio.charset.StandardCharsets;`
			`import java.nio.file.Paths;`
			`import java.util.*;`
			`import java.util.concurrent.Callable;`
			`import java.util.concurrent.Executors;`


			`public class PublicationsRetrieverPlugin {`

			`private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class);`

			`private static final String workingDir = System.getProperty("user.dir") + File.separator;`
			`private static String assignmentsBasePath = workingDir + "assignments" + File.separator;`
			`private static String assignmentsBaseFullTextsPath = assignmentsBasePath + "fullTexts" + File.separator;`

			`static {`
			`File assignmentsDir = new File(assignmentsBaseFullTextsPath);`
			`if ( !assignmentsDir.exists() ) {`
			`if ( !assignmentsDir.mkdirs() ) { // Create the directory.`
			`logger.error("Could not create the \"assignments directories\": \"" + assignmentsBaseFullTextsPath + "\". Using the \"workingDir\" instead (" + workingDir + ").");`
			`assignmentsBasePath = workingDir;`
			`assignmentsBaseFullTextsPath = assignmentsBasePath;`
			`}`
			`}`

			`// Specify some configurations`
			`LoaderAndChecker.retrieveDocuments = true;`
			`LoaderAndChecker.retrieveDatasets = false;`
			`FileUtils.shouldDownloadDocFiles = true;`
			`PublicationsRetriever.targetUrlType = "docUrl";`

			`int workerThreadsCount = Runtime.getRuntime().availableProcessors() * PublicationsRetriever.threadsMultiplier;`
			`logger.info("Use " + workerThreadsCount + " worker-threads.");`
			`PublicationsRetriever.executor = Executors.newFixedThreadPool(workerThreadsCount);`
			`}`


			`public static void processTasks(int assignmentId, Collection<Task> tasks) throws RuntimeException, FileNotFoundException`
			`{`
			`ConnSupportUtils.setKnownMimeTypes();`

			`FileUtils.storeDocFilesDir = assignmentsBaseFullTextsPath + "assignment_" + assignmentId + "_fullTexts" + File.separator; // It needs the last separator, because of how the docFiles are named and stored.`

			`FileUtils.setOutput(new FileOutputStream(assignmentsBasePath + "assignment_" + assignmentId + "_generic_results.json"));`

			`int tasksSize = tasks.size();`
			`int batchCount = 0;`
			`int taskCount = 0;`
			`List<Callable<Boolean>> callableTasks = new ArrayList<>(FileUtils.jsonBatchSize);`

			`// Start loading and checking urls.`
			`for ( Task task : tasks )`
			`{`
			`callableTasks.add(() -> {`
			`String id = task.getId();`
			`String url = task.getUrl();`

			`if ( (url = LoaderAndChecker.handleUrlChecks(id, url)) == null ) {`
			`return false;`
			`} // The "url" might have changed (inside "handleUrlChecks()").`

			`String urlToCheck = url;`
			`String sourceUrl = urlToCheck; // Hold it here for the logging-messages.`
			`if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {`
			`logger.warn("Could not canonicalize url: " + sourceUrl);`
			`UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false");`
			`LoaderAndChecker.connProblematicUrls.incrementAndGet();`
			`return false;`
			`}`

			`if ( UrlUtils.docOrDatasetUrlsWithIDs.containsKey(url) ) { // If we got into an already-found docUrl, log it and return.`
			`ConnSupportUtils.handleReCrossedDocUrl(id, url, url, url, logger, true);`
			`return true;`
			`}`

			`boolean isPossibleDocOrDatasetUrl = false; // Used for specific connection settings.`
			`String lowerCaseRetrievedUrl = url.toLowerCase();`
			`// Check if it's a possible-DocUrl, if so, this info will be used for optimal web-connection later.`
			`if ( (LoaderAndChecker.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches())`
			`\|\| (LoaderAndChecker.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseRetrievedUrl).matches()) ) {`
			`//logger.debug("Possible docUrl or datasetUrl: " + url);`
			`isPossibleDocOrDatasetUrl = true;`
			`}`

			`try { // Check if it's a docUrl, if not, it gets crawled.`
			`HttpConnUtils.connectAndCheckMimeType(id, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);`
			`} catch (Exception e) {`
			`String wasUrlValid = "true";`
			`if ( e instanceof RuntimeException ) {`
			`String message = e.getMessage();`
			`if ( (message != null) && message.contains("HTTP 404 Client Error") )`
			`wasUrlValid = "false";`
			`}`
			`UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false");`
			`}`
			`return true;`
			`});`

			`if ( ((++taskCount) >= FileUtils.jsonBatchSize) \|\| (taskCount >= tasksSize) )`
			`{`
			`logger.info("Batch counter: " + (++batchCount) + " \| progress: " + PublicationsRetriever.df.format(((batchCount-1) * taskCount) * 100.0 / tasksSize) + "% \| every batch contains " + FileUtils.jsonBatchSize + " id-url pairs.");`
			`LoaderAndChecker.invokeAllTasksAndWait(callableTasks);`
			`addUrlReportsToWorkerReport();`
			`callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); // Reset the thread-tasks-list for the next batch.`
			`}`
			`}// end tasks-for-loop`
			`}`


			`public static void addUrlReportsToWorkerReport()`
			`{`
			`for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )`
			`{`
			`String status = null, fileLocation = null, md5 = null;`
			`Long size = null;`
			`if ( data.getWasDocumentOrDatasetAccessible().equals("true") )`
			`{`
			`status = "accessible";`
			`fileLocation = data.getComment();`
			`if ( fileLocation.contains(UrlUtils.alreadyDownloadedByIDMessage) ) {`
			`// The file of this docUrl was already downloaded by another docUrl.`
			`String previousId = fileLocation.substring(UrlUtils.alreadyDownloadedByIDMessage.length() +1);`
			`//logger.debug("previousId: " + previousId); // DEBUG!`
			`// Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location.`
			`for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {`
			`if ( data_2.getUrlId().equals(previousId) && data_2.getWasDocumentOrDatasetAccessible().equals("true") ) {`
			`fileLocation = data_2.getComment();`
			`break;`
			`}`
			`}`
			`}`
			`else if ( fileLocation.contains("DocFileNotRetrievedException") )`
			`fileLocation = null;`

			`if ( fileLocation != null ) {`
			`try {`
			`File docFile = new File(fileLocation);`
			`if ( docFile.isFile() ) {`
			`md5 = Files.hash(docFile, Hashing.md5()).toString(); // These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files.`
			`//logger.debug("MD5 for file \"" + docFile.getName() + "\": " + md5); // DEBUG!`
			`size = java.nio.file.Files.size(Paths.get(fileLocation));`
			`//logger.debug("Size of file \"" + docFile.getName() + "\": " + size); // DEBUG!`
			`} else`
			`logger.error("No file was found with path: " + fileLocation);`
			`} catch (Exception e) {`
			`if ( md5 == null )`
			`logger.error("Could not retrieve the MD5-hash for the file: " + fileLocation);`

			`if ( size == null )`
			`logger.error("Could not retrieve the size of the file: " + fileLocation);`

			`e.printStackTrace();`
			`}`
			`} else`
			`fileLocation = "File not retrieved";`
			`} else`
			`status = "non-accessible";`

			`Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), data.getDocOrDatasetUrl(), new Date(), "application/pdf", size, "more_info", md5, fileLocation, null);`
			`AssignmentHandler.urlReports.add(new UrlReport(status, payload));`
			`}`
			`FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.`
			`}`


			`public static boolean connectWithUrlTest(String urlToCheck) {`
			`try {`
			`return HttpConnUtils.connectAndCheckMimeType("null", urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.`
			`} catch (Exception e) {`
			`UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false");`
			`return false;`
			`}`
			`}`
			`}`