forked from lsmyrnaios/UrlsController
- Add support for excluding specific datasources from being crawled. These datasources may be aggregated through bulk-imports by other pieces of software; one such datasource is "arXiv.org".
- Fix an issue where the "datasource-type" was retrieved instead of the "datasource-name".
- Polish the "findAssignmentsQuery".
This commit is contained in:
parent f835a752bf
commit 003c0bf179
@@ -52,39 +52,65 @@ public class UrlsServiceImpl implements UrlsService {
private final AtomicInteger maxAttemptsPerRecordAtomic;

private static String excludedDatasourceIDsStringList = null;

public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);


public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord) {
public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord,
                       @Value("${services.pdfaggregation.controller.datasources.excludedIDs}") List<String> excludedIDs) {
maxAttemptsPerRecordAtomic = new AtomicInteger(maxAttemptsPerRecord);

// The "excludedIDs" will not be null, as it will be defined inside the "application.yml" file.
// In case no IDs for excluded datasources are given, the "excludedIDs" list will just be empty.
int exclusionListSize = excludedIDs.size();
if ( exclusionListSize == 0 )
    return; // So the "excludedDatasourceIDsStringList"-code should be placed last in this constructor-method.

// Prepare the "excludedDatasourceIDsStringList" to be used inside the "findAssignmentsQuery". Create the following string-pattern:
// ("ID_1", "ID_2", ...)

final StringBuilder sb = new StringBuilder((exclusionListSize * 46) + (exclusionListSize - 1) + 2);
sb.append("(");
for ( int i = 0; i < exclusionListSize; ++i ) {
    sb.append("\"").append(excludedIDs.get(i)).append("\"");
    if ( i < (exclusionListSize - 1) )
        sb.append(",");
}
sb.append(")");

excludedDatasourceIDsStringList = sb.toString();
//logger.debug("excludedDatasourceIDsStringList :\n" + excludedDatasourceIDsStringList); // DEBUG!
}

public ResponseEntity<?> getAssignments(String workerId, int assignmentsLimit)
{
// Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.

String findAssignmentsQuery = "select pubid, url, datasourceid, datasourcetype\n" +
    "from (select distinct pubid, url, datasourceid, datasourcetype, attempt_count\n" +
    "from (\n" +
    "select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" +
    "from " + ImpalaConnector.databaseName + ".publication p\n" +
    "join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
    "join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
    "left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
    "on attempts.id=p.id\n" +
    "left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
    "union all\n" +
    "select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
    "on existing.id=p.id and existing.original_url=pu.url\n" +
    "where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() +
    "\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
    "and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls; there are no "null" urls, but keep the relevant check for future-proofing.
    "limit " + (assignmentsLimit * 10) +
    ")\nas non_distinct_results\n" +
    "order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
    "limit " + assignmentsLimit +
    "\n) as findAssignmentsQuery";
String findAssignmentsQuery =
    "select pubid, url, datasourceid, datasourcename\n" +
    "from (select distinct pubid, url, datasourceid, datasourcename, attempt_count\n" +
    " from (select p.id as pubid, pu.url as url, d.id as datasourceid, d.name as datasourcename, attempts.counts as attempt_count\n" +
    " from " + ImpalaConnector.databaseName + ".publication p\n" +
    " join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
    " join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
    " left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
    " on attempts.id=p.id\n" +
    " left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
    " union all\n" +
    " select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
    " on existing.id=p.id and existing.original_url=pu.url\n" +
    " where d.allow_harvest=true and existing.id is null\n" +
    ((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below.
        (" and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
    " and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
    " and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
    " and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls; there are no "null" urls, but keep the relevant check for future-proofing.
    " limit " + (assignmentsLimit * 10) + ")\n" +
    " as non_distinct_results\n" +
    " order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
    " limit " + assignmentsLimit + ")\n" +
    "as findAssignmentsQuery";

// The "order by" in the end makes sure the older attempted records will be re-attempted after a long time.
//logger.debug("findAssignmentsQuery:\n" + findAssignmentsQuery); // DEBUG!

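For reference, here is a minimal standalone sketch of the exclusion-list formatting used above, outside of Spring. The class name, method name and the second sample ID are illustrative assumptions, not part of this commit; the sketch only shows the ("ID_1","ID_2",...) pattern that the constructor embeds into the "findAssignmentsQuery".

import java.util.List;

public class ExcludedIdsFormatSketch {

    // Builds the same ("ID_1","ID_2",...) pattern as the constructor above; returns null when no IDs are given.
    static String buildExcludedDatasourceIDsStringList(List<String> excludedIDs) {
        int exclusionListSize = excludedIDs.size();
        if ( exclusionListSize == 0 )
            return null;
        final StringBuilder sb = new StringBuilder((exclusionListSize * 46) + (exclusionListSize - 1) + 2);
        sb.append("(");
        for ( int i = 0; i < exclusionListSize; ++i ) {
            sb.append("\"").append(excludedIDs.get(i)).append("\"");
            if ( i < (exclusionListSize - 1) )
                sb.append(",");
        }
        sb.append(")");
        return sb.toString();
    }

    public static void main(String[] args) {
        // The second ID is a hypothetical placeholder, used only to show the comma-separation.
        String list = buildExcludedDatasourceIDsStringList(List.of(
                "opendoar____::6f4922f45568161a8cdf4ad2299f6d23",
                "hypothetical::0123456789abcdef0123456789abcdef"));
        // Prints: and d.id not in ("opendoar____::6f4922f45568161a8cdf4ad2299f6d23","hypothetical::0123456789abcdef0123456789abcdef")
        System.out.println("and d.id not in " + list);
    }
}

The 46 used in the StringBuilder pre-sizing corresponds to the length of such OpenAIRE datasource IDs (a prefix like "opendoar____::" followed by a 32-character hex hash), with the remaining capacity covering the quotes, commas and parentheses.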
@@ -29,6 +29,13 @@ services:
shouldEmptyBucket: false
shouldShowAllS3Buckets: true

datasources: # Provide a list of datasource IDs which should be excluded from crawling. Their content is either bulk-imported or is known to be restricted.
  excludedIDs: > # Use comma-separated values (one per line for best readability), as Spring is currently incapable of parsing Dropwizard-styled lists.
    opendoar____::6f4922f45568161a8cdf4ad2299f6d23

  # Since we use a multi-line value for our list, we add the ID-explanations here (otherwise the comments would become part of the values):
  # First-id: arXiv.org e-Print Archive

spring:
  datasource:
    driver-class-name: com.cloudera.impala.jdbc41.Driver
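As a rough illustration of how the "excludedIDs" value above reaches the new constructor parameter: the YAML ">" folded scalar joins the (possibly multi-line) value into a single comma-separated string, which Spring then converts into a List<String>. The sketch below approximates that conversion with a plain split-and-trim and uses a hypothetical second ID; it is not code from this commit.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class ExcludedIdsBindingSketch {
    public static void main(String[] args) {
        // Roughly what the folded YAML scalar resolves to (second ID is hypothetical):
        String resolvedValue = "opendoar____::6f4922f45568161a8cdf4ad2299f6d23, hypothetical::0123456789abcdef0123456789abcdef";

        // Approximation of Spring's comma-separated String -> List<String> conversion.
        List<String> excludedIDs = Arrays.stream(resolvedValue.split(","))
                .map(String::trim)
                .collect(Collectors.toList());

        System.out.println(excludedIDs.size()); // 2
        System.out.println(excludedIDs.get(0)); // opendoar____::6f4922f45568161a8cdf4ad2299f6d23
    }
}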