- Add support for excluding specific datasources from being crawled. The content of these datasources may already be aggregated through bulk-imports by other pieces of software. One such datasource is "arXiv.org".

- Fix an issue where the "datasource-type" was retrieved instead of the "datasource-name".
- Polish the "findAssignmentsQuery".
This commit is contained in:
Lampros Smyrnaios 2023-03-21 07:19:35 +02:00
parent f835a752bf
commit 003c0bf179
2 changed files with 56 additions and 23 deletions

View File

@ -52,39 +52,65 @@ public class UrlsServiceImpl implements UrlsService {
// Upper bound for the number of attempts per record. It is set in the constructor and read when the "findAssignmentsQuery" is built.
private final AtomicInteger maxAttemptsPerRecordAtomic;
// SQL-ready list of the excluded datasource-IDs, in the form: ("ID_1","ID_2",...). It stays null when no excluded IDs are configured.
private static String excludedDatasourceIDsStringList = null;
// Fixed pool of 6 threads. NOTE(review): presumably used for running database-inserts in parallel; its usage is not visible here, please confirm.
public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);
public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord) {
/**
 * Creates the service, stores the attempts-threshold and prepares the datasource exclusion-list
 * which is used inside the "findAssignmentsQuery".
 *
 * @param maxAttemptsPerRecord the value of the "services.pdfaggregation.controller.maxAttemptsPerRecord" property.
 * @param excludedIDs the value of the "services.pdfaggregation.controller.datasources.excludedIDs" property.
 *                    It is expected to be non-null, as the property is defined inside the "application.yml" file.
 *                    When no IDs are given, the list is just empty.
 */
public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord,
        @Value("${services.pdfaggregation.controller.datasources.excludedIDs}") List<String> excludedIDs) {
    maxAttemptsPerRecordAtomic = new AtomicInteger(maxAttemptsPerRecord);

    // Prepare the "excludedDatasourceIDsStringList" to be used inside the "findAssignmentsQuery". Create the following string-pattern:
    // ("ID_1","ID_2",...)
    // The field is always assigned here, so an empty exclusion-list resets it to null, instead of keeping a value from a previous instantiation.
    // A null value means that no exclusion-filter should be added to the query.
    excludedDatasourceIDsStringList = excludedIDs.isEmpty()
            ? null
            : "(\"" + String.join("\",\"", excludedIDs) + "\")";
    //logger.debug("excludedDatasourceIDsStringList :\n" + excludedDatasourceIDsStringList); // DEBUG!
}
public ResponseEntity<?> getAssignments(String workerId, int assignmentsLimit)
{
// Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.
String findAssignmentsQuery = "select pubid, url, datasourceid, datasourcetype\n" +
"from (select distinct pubid, url, datasourceid, datasourcetype, attempt_count\n" +
"from (\n" +
"select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" +
"from " + ImpalaConnector.databaseName + ".publication p\n" +
"join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
"join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
"left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
"on attempts.id=p.id\n" +
"left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
"union all\n" +
"select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
"on existing.id=p.id and existing.original_url=pu.url\n" +
"where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() +
"\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
"and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
"limit " + (assignmentsLimit * 10) +
")\nas non_distinct_results\n" +
"order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
"limit " + assignmentsLimit +
"\n) as findAssignmentsQuery";
String findAssignmentsQuery =
"select pubid, url, datasourceid, datasourcename\n" +
"from (select distinct pubid, url, datasourceid, datasourcename, attempt_count\n" +
" from (select p.id as pubid, pu.url as url, d.id as datasourceid, d.name as datasourcename, attempts.counts as attempt_count\n" +
" from " + ImpalaConnector.databaseName + ".publication p\n" +
" join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
" join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
" left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
" on attempts.id=p.id\n" +
" left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
" union all\n" +
" select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
" on existing.id=p.id and existing.original_url=pu.url\n" +
" where d.allow_harvest=true and existing.id is null\n" +
((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below.
(" and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
" and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
" and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
" and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
" limit " + (assignmentsLimit * 10) + ")\n" +
" as non_distinct_results\n" +
" order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
" limit " + assignmentsLimit + ")\n" +
"as findAssignmentsQuery";
// The "order by" in the end makes sure the older attempted records will be re-attempted after a long time.
//logger.debug("findAssignmentsQuery:\n" + findAssignmentsQuery); // DEBUG!

View File

@ -29,6 +29,13 @@ services:
shouldEmptyBucket: false
shouldShowAllS3Buckets: true
datasources: # Provide a list of datasource IDs, which should be excluded from crawling. Their content is either bulk-imported or is known to be restricted.
excludedIDs: > # Use comma-separated values (one in each line for best readability), as Spring is currently incapable of parsing Dropwizard-styled lists.
opendoar____::6f4922f45568161a8cdf4ad2299f6d23
# Since we use a multi-line value for our list, we add the ID-explanations here (otherwise the comments will become part of the values):
# First-id: arXiv.org e-Print Archive
spring:
datasource:
driver-class-name: com.cloudera.impala.jdbc41.Driver