- Exclude empty and null urls in the assignments.
- Update the "getFullTextsImproved"-call to "getFullTexts", now that the "improved" version is stable. - Update Gradle. - Code polishing.
This commit is contained in:
parent
2253f05bf5
commit
a1c16ffc19
|
@ -25,4 +25,4 @@ If you want to build and run the app on a **Docker Container**, then run the scr
|
||||||
|
|
||||||
Implementation notes:
|
Implementation notes:
|
||||||
- For transferring the full-text files, we use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings very big benefits in compression rate and speed.
|
- For transferring the full-text files, we use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings very big benefits in compression rate and speed.
|
||||||
- The names of the uploaded full-text files ae of the following form: "***datasourceID/recordId::fileHash.pdf***"
|
- The uploaded full-text files follow this naming-scheme: "**datasourceID/recordId::fileHash.pdf**"
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
distributionBase=GRADLE_USER_HOME
|
distributionBase=GRADLE_USER_HOME
|
||||||
distributionPath=wrapper/dists
|
distributionPath=wrapper/dists
|
||||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip
|
distributionUrl=https\://services.gradle.org/distributions/gradle-8.0-bin.zip
|
||||||
networkTimeout=10000
|
networkTimeout=10000
|
||||||
zipStoreBase=GRADLE_USER_HOME
|
zipStoreBase=GRADLE_USER_HOME
|
||||||
zipStorePath=wrapper/dists
|
zipStorePath=wrapper/dists
|
||||||
|
|
|
@ -26,7 +26,7 @@ if [[ justInstall -eq 1 && shouldRunInDocker -eq 1 ]]; then
|
||||||
justInstall=0
|
justInstall=0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
gradleVersion="7.6"
|
gradleVersion="8.0"
|
||||||
|
|
||||||
if [[ justInstall -eq 0 ]]; then
|
if [[ justInstall -eq 0 ]]; then
|
||||||
|
|
||||||
|
|
|
@ -94,21 +94,23 @@ public class UrlController {
|
||||||
"from " + ImpalaConnector.databaseName + ".publication p\n" +
|
"from " + ImpalaConnector.databaseName + ".publication p\n" +
|
||||||
"join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
|
"join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
|
||||||
"join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
|
"join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
|
||||||
"left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts on attempts.id=p.id\n" +
|
"left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
|
||||||
|
"on attempts.id=p.id\n" +
|
||||||
"left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
|
"left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
|
||||||
"union all\n" +
|
"union all\n" +
|
||||||
"select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl)\n" +
|
"select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
|
||||||
"as existing on existing.id=p.id and existing.original_url=pu.url\n" +
|
"on existing.id=p.id and existing.original_url=pu.url\n" +
|
||||||
"where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() +
|
"where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() +
|
||||||
"\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
|
"\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
|
||||||
"limit " + (assignmentsLimit * 10) + ")\n" +
|
"and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
|
||||||
"as non_distinct_results\n" +
|
"limit " + (assignmentsLimit * 10) +
|
||||||
|
")\nas non_distinct_results\n" +
|
||||||
"order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
|
"order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
|
||||||
"limit " + assignmentsLimit + ")\n" +
|
"limit " + assignmentsLimit +
|
||||||
"as findAssignmentsQuery";
|
"\n) as findAssignmentsQuery";
|
||||||
|
|
||||||
// The "order by" in the end makes sure the older attempted records will be re-attempted after a long time.
|
// The "order by" in the end makes sure the older attempted records will be re-attempted after a long time.
|
||||||
//logger.debug(findAssignmentsQuery); // DEBUG!
|
//logger.debug("findAssignmentsQuery:\n" + findAssignmentsQuery); // DEBUG!
|
||||||
|
|
||||||
String getAssignmentsQuery = "select * from " + ImpalaConnector.databaseName + ".current_assignment";
|
String getAssignmentsQuery = "select * from " + ImpalaConnector.databaseName + ".current_assignment";
|
||||||
|
|
||||||
|
|
|
@ -67,4 +67,8 @@ public class StatsService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// To get the human-friendly timestamp format from the BigInt in the database:
|
||||||
|
// select from_timestamp(CAST(CAST(`date` as decimal(30,0))/1000 AS timestamp), "yyyy-MM-dd HH:mm:ss.SSS") from payload
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -237,7 +237,7 @@ public class FileUtils {
|
||||||
logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numAllFullTexts + " distinct non-already-uploaded fullTexts. Going to request them from the Worker \"" + workerId + "\", in " + numOfBatches + " batches (" + numOfFullTextsPerBatch + " files each).");
|
logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numAllFullTexts + " distinct non-already-uploaded fullTexts. Going to request them from the Worker \"" + workerId + "\", in " + numOfBatches + " batches (" + numOfFullTextsPerBatch + " files each).");
|
||||||
|
|
||||||
// Check if one full text is left out because of the division. Put it int the last batch.
|
// Check if one full text is left out because of the division. Put it int the last batch.
|
||||||
String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTextsImproved/" + assignmentsBatchCounter + "/" + numOfBatches + "/";
|
String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTexts/" + assignmentsBatchCounter + "/" + numOfBatches + "/";
|
||||||
|
|
||||||
// TODO - The worker should send the port in which it accepts requests, along with the current request.
|
// TODO - The worker should send the port in which it accepts requests, along with the current request.
|
||||||
// TODO - The least we have to do it to expose the port-assignment somewhere more obvious like inside the "application.properties" file.
|
// TODO - The least we have to do it to expose the port-assignment somewhere more obvious like inside the "application.properties" file.
|
||||||
|
|
Loading…
Reference in New Issue