parent
33df46f6f5
commit
952bf7c035
12
build.gradle
12
build.gradle
|
@ -1,12 +1,14 @@
|
||||||
plugins {
|
plugins {
|
||||||
id 'org.springframework.boot' version '2.7.12'
|
id 'org.springframework.boot' version '2.7.13'
|
||||||
id 'io.spring.dependency-management' version '1.1.0'
|
id 'io.spring.dependency-management' version '1.1.0'
|
||||||
id 'java'
|
id 'java'
|
||||||
}
|
}
|
||||||
|
|
||||||
group = 'eu.openaire.urls_worker'
|
java {
|
||||||
version = '2.1.0-SNAPSHOT'
|
group = 'eu.openaire.urls_worker'
|
||||||
sourceCompatibility = '1.8'
|
version = '2.1.0-SNAPSHOT'
|
||||||
|
sourceCompatibility = '1.8'
|
||||||
|
}
|
||||||
|
|
||||||
repositories {
|
repositories {
|
||||||
mavenCentral()
|
mavenCentral()
|
||||||
|
@ -37,7 +39,7 @@ dependencies {
|
||||||
exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
|
exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
|
||||||
}
|
}
|
||||||
|
|
||||||
implementation group: 'com.google.guava', name: 'guava', version: '32.0.1-jre'
|
implementation group: 'com.google.guava', name: 'guava', version: '32.1.1-jre'
|
||||||
|
|
||||||
// https://mvnrepository.com/artifact/com.google.code.gson/gson
|
// https://mvnrepository.com/artifact/com.google.code.gson/gson
|
||||||
implementation 'com.google.code.gson:gson:2.10.1'
|
implementation 'com.google.code.gson:gson:2.10.1'
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
distributionBase=GRADLE_USER_HOME
|
distributionBase=GRADLE_USER_HOME
|
||||||
distributionPath=wrapper/dists
|
distributionPath=wrapper/dists
|
||||||
distributionUrl=https\://services.gradle.org/distributions/gradle-8.1.1-bin.zip
|
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
|
||||||
networkTimeout=10000
|
networkTimeout=10000
|
||||||
|
validateDistributionUrl=true
|
||||||
zipStoreBase=GRADLE_USER_HOME
|
zipStoreBase=GRADLE_USER_HOME
|
||||||
zipStorePath=wrapper/dists
|
zipStorePath=wrapper/dists
|
||||||
|
|
|
@ -22,7 +22,7 @@ elif [[ $# -gt 2 ]]; then
|
||||||
echo -e "Wrong number of arguments given: ${#} (more than 2)\nPlease execute it like: script.sh <justRun: 0 | 1> <avoidReInstallingPublicationsRetriever: 0 | 1>"; exit 2
|
echo -e "Wrong number of arguments given: ${#} (more than 2)\nPlease execute it like: script.sh <justRun: 0 | 1> <avoidReInstallingPublicationsRetriever: 0 | 1>"; exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
gradleVersion="8.1.1"
|
gradleVersion="8.2"
|
||||||
|
|
||||||
shouldBeCarefulWithMaxHeap=0 # This is NOT a cmd-arg.
|
shouldBeCarefulWithMaxHeap=0 # This is NOT a cmd-arg.
|
||||||
|
|
||||||
|
|
|
@ -195,7 +195,7 @@ public class AssignmentsHandler {
|
||||||
|
|
||||||
// Every time we reach a "limit" of handled id-url pairs, clear some data-structures of the underlying "PublicationsRetriever" program.
|
// Every time we reach a "limit" of handled id-url pairs, clear some data-structures of the underlying "PublicationsRetriever" program.
|
||||||
// This helps with reducing the memory consumption over the period of weeks or months, and also gives a 2nd chance to some domains which may be blocked due to connectivity issues, but after a month they may be fine.
|
// This helps with reducing the memory consumption over the period of weeks or months, and also gives a 2nd chance to some domains which may be blocked due to connectivity issues, but after a month they may be fine.
|
||||||
long idUrlPairsHandled = (numHandledAssignmentsBatches * this.maxAssignmentsLimitPerBatch);
|
long idUrlPairsHandled = (numHandledAssignmentsBatches * maxAssignmentsLimitPerBatch);
|
||||||
|
|
||||||
if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData +1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) {
|
if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData +1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) {
|
||||||
UrlUtils.duplicateUrls.clear();
|
UrlUtils.duplicateUrls.clear();
|
||||||
|
@ -212,11 +212,11 @@ public class AssignmentsHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( GeneralController.shouldShutdownWorker
|
if ( GeneralController.shouldShutdownWorker
|
||||||
|| (numHandledAssignmentsBatches == this.maxAssignmentsBatchesToHandleBeforeShutdown) )
|
|| (numHandledAssignmentsBatches == maxAssignmentsBatchesToHandleBeforeShutdown) )
|
||||||
{
|
{
|
||||||
logger.info("The worker will shutdown, after the full-texts are delivered to the Controller, as " + (GeneralController.shouldShutdownWorker
|
logger.info("The worker will shutdown, after the full-texts are delivered to the Controller, as " + (GeneralController.shouldShutdownWorker
|
||||||
? "it received a \"shutdownWorker\" request!"
|
? "it received a \"shutdownWorker\" request!"
|
||||||
: "the maximum assignments-batches (" + this.maxAssignmentsBatchesToHandleBeforeShutdown + ") to be handled was reached!"));
|
: "the maximum assignments-batches (" + maxAssignmentsBatchesToHandleBeforeShutdown + ") to be handled was reached!"));
|
||||||
|
|
||||||
// Here, just specify that we do not want to request more assignments. A scheduling job will check if the fulltexts were delivered to the Controller and then shut down the Worker.
|
// Here, just specify that we do not want to request more assignments. A scheduling job will check if the fulltexts were delivered to the Controller and then shut down the Worker.
|
||||||
shouldNotRequestMore = true;
|
shouldNotRequestMore = true;
|
||||||
|
@ -302,13 +302,12 @@ public class AssignmentsHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( sb != null ) {
|
if ( sb != null ) {
|
||||||
logger.debug("Before change:\n" + sb.toString()); // DEBUG!
|
logger.debug("Before change:\n" + sb); // DEBUG!
|
||||||
sb.setLength(0); // Reset it without re-sizing it.
|
sb.setLength(0); // Reset it without re-sizing it.
|
||||||
}
|
}
|
||||||
|
|
||||||
List<String> domains = new ArrayList<>(domainsWithAssignments.keySet());
|
List<String> domains = new ArrayList<>(domainsWithAssignments.keySet());
|
||||||
int domainsSize = domains.size();
|
int domainsSize = domains.size();
|
||||||
|
|
||||||
Integer domainsCounter = -1;
|
Integer domainsCounter = -1;
|
||||||
|
|
||||||
for ( int i = 0; i < assignmentsSize; ++i )
|
for ( int i = 0; i < assignmentsSize; ++i )
|
||||||
|
@ -324,7 +323,7 @@ public class AssignmentsHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( sb != null )
|
if ( sb != null )
|
||||||
logger.debug("After change:\n" + sb.toString());
|
logger.debug("After change:\n" + sb);
|
||||||
|
|
||||||
return spacedOutAssignments;
|
return spacedOutAssignments;
|
||||||
}
|
}
|
||||||
|
@ -337,13 +336,9 @@ public class AssignmentsHandler {
|
||||||
* */
|
* */
|
||||||
public static HashMap<Object, Integer> getFirstAvailableObjectForSpacedOutDomains(List<String> domainsList, Integer domainsCounter, HashMultimap<String, ?> domainsWithAssignments, int domainsSize, StringBuilder sb)
|
public static HashMap<Object, Integer> getFirstAvailableObjectForSpacedOutDomains(List<String> domainsList, Integer domainsCounter, HashMultimap<String, ?> domainsWithAssignments, int domainsSize, StringBuilder sb)
|
||||||
{
|
{
|
||||||
HashMap<Object, Integer> result = new HashMap<>();
|
|
||||||
Object nextAssignment = null;
|
|
||||||
|
|
||||||
// Normally, this method does not need a recursion-break-safety, as the initial-caller method should call this method exactly N times, where N is the number of all the values of "domainsWithAssignments".
|
// Normally, this method does not need a recursion-break-safety, as the initial-caller method should call this method exactly N times, where N is the number of all the values of "domainsWithAssignments".
|
||||||
// Although, for extra-safety and re-usability, let's have this check here.
|
// Although, for extra-safety and re-usability, let's have this check here.
|
||||||
Set<String> domainsSet = domainsWithAssignments.keySet();
|
if ( domainsWithAssignments.keySet().isEmpty() )
|
||||||
if ( domainsSet.isEmpty() )
|
|
||||||
return null; // Break recursion when the domains run-out.
|
return null; // Break recursion when the domains run-out.
|
||||||
|
|
||||||
if ( domainsCounter < (domainsSize -1) )
|
if ( domainsCounter < (domainsSize -1) )
|
||||||
|
@ -353,22 +348,21 @@ public class AssignmentsHandler {
|
||||||
|
|
||||||
String currentDomain = domainsList.get(domainsCounter);
|
String currentDomain = domainsList.get(domainsCounter);
|
||||||
Set<?> assignmentsOfCurrentDomain = domainsWithAssignments.get(currentDomain);
|
Set<?> assignmentsOfCurrentDomain = domainsWithAssignments.get(currentDomain);
|
||||||
if ( assignmentsOfCurrentDomain.isEmpty() ) {
|
if ( assignmentsOfCurrentDomain.isEmpty() ) // This domain is out of assignments, check the next available one.
|
||||||
// This domain is out of assignments, check the next available one.
|
return getFirstAvailableObjectForSpacedOutDomains(domainsList, domainsCounter, domainsWithAssignments, domainsSize, sb);
|
||||||
result = getFirstAvailableObjectForSpacedOutDomains(domainsList, domainsCounter, domainsWithAssignments, domainsSize, sb);
|
|
||||||
} else {
|
Object nextAssignment = assignmentsOfCurrentDomain.toArray()[0];
|
||||||
nextAssignment = assignmentsOfCurrentDomain.toArray()[0];
|
HashMap<Object, Integer> result = new HashMap<>();
|
||||||
result.put(nextAssignment, domainsCounter);
|
result.put(nextAssignment, domainsCounter);
|
||||||
domainsWithAssignments.remove(currentDomain, nextAssignment);
|
domainsWithAssignments.remove(currentDomain, nextAssignment);
|
||||||
if ( sb != null )
|
if ( sb != null )
|
||||||
sb.append(currentDomain).append("\n"); // DEBUG!
|
sb.append(currentDomain).append("\n"); // DEBUG!
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Lock fileWriteLock = new ReentrantLock(true);
|
private static final Lock fileWriteLock = new ReentrantLock(true);
|
||||||
|
|
||||||
public String writeToFile(String fileFullPath, String stringToWrite, boolean shouldLockThreads)
|
public String writeToFile(String fileFullPath, String stringToWrite, boolean shouldLockThreads)
|
||||||
{
|
{
|
||||||
|
|
|
@ -7,9 +7,7 @@ import eu.openaire.publications_retriever.util.http.HttpConnUtils;
|
||||||
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
|
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
|
||||||
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
|
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
|
||||||
import eu.openaire.publications_retriever.util.url.UrlUtils;
|
import eu.openaire.publications_retriever.util.url.UrlUtils;
|
||||||
import eu.openaire.urls_worker.UrlsWorkerApplication;
|
|
||||||
import eu.openaire.urls_worker.components.AssignmentsHandler;
|
import eu.openaire.urls_worker.components.AssignmentsHandler;
|
||||||
import eu.openaire.urls_worker.components.ConnWithController;
|
|
||||||
import eu.openaire.urls_worker.controllers.GeneralController;
|
import eu.openaire.urls_worker.controllers.GeneralController;
|
||||||
import eu.openaire.urls_worker.models.Assignment;
|
import eu.openaire.urls_worker.models.Assignment;
|
||||||
import eu.openaire.urls_worker.models.Error;
|
import eu.openaire.urls_worker.models.Error;
|
||||||
|
@ -18,13 +16,11 @@ import eu.openaire.urls_worker.models.UrlReport;
|
||||||
import eu.openaire.urls_worker.services.FileStorageService;
|
import eu.openaire.urls_worker.services.FileStorageService;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.net.CookieStore;
|
import java.net.CookieStore;
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.sql.Timestamp;
|
import java.sql.Timestamp;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
@ -39,10 +35,6 @@ public class PublicationsRetrieverPlugin {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class);
|
private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class);
|
||||||
|
|
||||||
@Autowired
|
|
||||||
private ConnWithController connWithController;
|
|
||||||
|
|
||||||
|
|
||||||
public static String assignmentsBasePath;
|
public static String assignmentsBasePath;
|
||||||
|
|
||||||
private static String workerId;
|
private static String workerId;
|
||||||
|
@ -179,6 +171,9 @@ public class PublicationsRetrieverPlugin {
|
||||||
private static final int lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage = ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length();
|
private static final int lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage = ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length();
|
||||||
private static final int lengthOfAlreadyDownloadedFromIDMessage = ConnSupportUtils.alreadyDownloadedFromIDMessage.length();
|
private static final int lengthOfAlreadyDownloadedFromIDMessage = ConnSupportUtils.alreadyDownloadedFromIDMessage.length();
|
||||||
|
|
||||||
|
private static final String provenance = "crawl:PublicationsRetriever";
|
||||||
|
|
||||||
|
|
||||||
public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
|
public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
|
||||||
{
|
{
|
||||||
if ( FileUtils.dataToBeLoggedList.size() != assignments.size() ) {
|
if ( FileUtils.dataToBeLoggedList.size() != assignments.size() ) {
|
||||||
|
@ -246,6 +241,7 @@ public class PublicationsRetrieverPlugin {
|
||||||
if ( !foundAlreadyDownloadedFullText ) {
|
if ( !foundAlreadyDownloadedFullText ) {
|
||||||
String addErrorMessage = ((!foundIDUrlInWorkerReport) ? " | That ID-sourceUrl was not found inside the WorkerReport!" : " | The file was not downloaded!");
|
String addErrorMessage = ((!foundIDUrlInWorkerReport) ? " | That ID-sourceUrl was not found inside the WorkerReport!" : " | The file was not downloaded!");
|
||||||
error = new Error(Error.ErrorType.couldRetry, comment + addErrorMessage); // We can still try to download it from the found docUrl, in the future.
|
error = new Error(Error.ErrorType.couldRetry, comment + addErrorMessage); // We can still try to download it from the found docUrl, in the future.
|
||||||
|
// The "fileLocation" is null.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if ( ! comment.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) ) { // If it was downloaded without an error.
|
else if ( ! comment.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) ) { // If it was downloaded without an error.
|
||||||
|
@ -279,7 +275,7 @@ public class PublicationsRetrieverPlugin {
|
||||||
// Each record will have the urlID, the datasourceID and possibly one filename, which may contain a different urlID.
|
// Each record will have the urlID, the datasourceID and possibly one filename, which may contain a different urlID.
|
||||||
// The Controller will select the correct datasourceID before adding it inside the S3-ObjectStore filename.
|
// The Controller will select the correct datasourceID before adding it inside the S3-ObjectStore filename.
|
||||||
|
|
||||||
Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever", datasourceId);
|
Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, provenance, datasourceId);
|
||||||
// TODO - If support is added for doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
// TODO - If support is added for doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||||
|
|
||||||
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
|
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
|
||||||
|
|
Loading…
Reference in New Issue