forked from lsmyrnaios/UrlsController

- Add support for excluding specific datasources from being crawled. These datasources may be aggregated through bulk-imports by other pieces of software; one such datasource is "arXiv.org".
- Fix an issue where the "datasource-type" was retrieved instead of the "datasource-name".
- Polish the "findAssignmentsQuery".
This commit is contained in:
parent f835a752bf
commit 003c0bf179
UrlsServiceImpl.java
@@ -52,39 +52,65 @@ public class UrlsServiceImpl implements UrlsService {
 
 	private final AtomicInteger maxAttemptsPerRecordAtomic;
 
+	private static String excludedDatasourceIDsStringList = null;
+
 	public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6);
 
-	public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord) {
+	public UrlsServiceImpl(@Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") int maxAttemptsPerRecord,
+						   @Value("${services.pdfaggregation.controller.datasources.excludedIDs}") List<String> excludedIDs) {
 		maxAttemptsPerRecordAtomic = new AtomicInteger(maxAttemptsPerRecord);
+
+		// The "excludedIDs" will not be null, as it will be defined inside the "application.yml" file.
+		// In case no IDs for excluded Datasources are given, then the "excludedIDs" list will just be empty.
+		int exclusionListSize = excludedIDs.size();
+		if ( exclusionListSize == 0 )
+			return;	// So the "excludedDatasourceIDsStringList"-code should be placed last in this Constructor-method.
+
+		// Prepare the "excludedDatasourceIDsStringList" to be used inside the "findAssignmentsQuery". Create the following string-pattern:
+		// ("ID_1", "ID_2", ...)
+		final StringBuilder sb = new StringBuilder((exclusionListSize * 46) + (exclusionListSize - 1) + 2);
+		sb.append("(");
+		for ( int i = 0; i < exclusionListSize; ++i ) {
+			sb.append("\"").append(excludedIDs.get(i)).append("\"");
+			if ( i < (exclusionListSize - 1) )
+				sb.append(",");
+		}
+		sb.append(")");
+
+		excludedDatasourceIDsStringList = sb.toString();
+		//logger.debug("excludedDatasourceIDsStringList:\n" + excludedDatasourceIDsStringList);	// DEBUG!
 	}
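Note: the constructor above turns the configured IDs into an SQL-ready tuple of the form `("ID_1","ID_2",...)`. The `StringBuilder` is pre-sized on the assumption that each OpenAIRE datasource ID is about 46 characters long (e.g. `opendoar____::6f4922f45568161a8cdf4ad2299f6d23`). A minimal standalone sketch of the same string-building logic follows; the class name and the second sample ID are made up for illustration:

```java
import java.util.List;

public class ExclusionListSketch {

	public static void main(String[] args) {
		// Sample IDs; the real values are injected from "application.yml".
		// The second ID is a placeholder, not a real datasource.
		List<String> excludedIDs = List.of(
				"opendoar____::6f4922f45568161a8cdf4ad2299f6d23",
				"opendoar____::aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");

		int size = excludedIDs.size();
		// Pre-size for: (size * ~46 chars per ID) + (size - 1) commas + 2 parentheses.
		StringBuilder sb = new StringBuilder((size * 46) + (size - 1) + 2);
		sb.append("(");
		for (int i = 0; i < size; ++i) {
			sb.append("\"").append(excludedIDs.get(i)).append("\"");
			if (i < (size - 1))
				sb.append(",");
		}
		sb.append(")");

		System.out.println(sb);	// ("opendoar____::6f49...","opendoar____::aaaa...")
	}
}
```

The same tuple could be produced with `"(\"" + String.join("\",\"", excludedIDs) + "\")"`; the explicit loop simply keeps the pre-sized buffer visible.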
 
 	public ResponseEntity<?> getAssignments(String workerId, int assignmentsLimit)
 	{
 		// Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.
-		String findAssignmentsQuery = "select pubid, url, datasourceid, datasourcetype\n" +
-				"from (select distinct pubid, url, datasourceid, datasourcetype, attempt_count\n" +
-				"from (\n" +
-				"select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" +
-				"from " + ImpalaConnector.databaseName + ".publication p\n" +
-				"join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
-				"join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
-				"left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
-				"on attempts.id=p.id\n" +
-				"left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
-				"union all\n" +
-				"select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
-				"on existing.id=p.id and existing.original_url=pu.url\n" +
-				"where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() +
-				"\nand not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
-				"and pu.url != '' and pu.url is not null\n" +	// Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
-				"limit " + (assignmentsLimit * 10) +
-				")\nas non_distinct_results\n" +
-				"order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
-				"limit " + assignmentsLimit +
-				"\n) as findAssignmentsQuery";
+		String findAssignmentsQuery =
+				"select pubid, url, datasourceid, datasourcename\n" +
+				"from (select distinct pubid, url, datasourceid, datasourcename, attempt_count\n" +
+				"    from (select p.id as pubid, pu.url as url, d.id as datasourceid, d.name as datasourcename, attempts.counts as attempt_count\n" +
+				"        from " + ImpalaConnector.databaseName + ".publication p\n" +
+				"        join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
+				"        join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" +
+				"        left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" +
+				"        on attempts.id=p.id\n" +
+				"        left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" +
+				"        union all\n" +
+				"        select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" +
+				"        on existing.id=p.id and existing.original_url=pu.url\n" +
+				"        where d.allow_harvest=true and existing.id is null\n" +
+				((excludedDatasourceIDsStringList != null) ?	// If we have an exclusion-list, use it below.
+					("        and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") +
+				"        and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" +
+				"        and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" +
+				"        and pu.url != '' and pu.url is not null\n" +	// Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing.
+				"        limit " + (assignmentsLimit * 10) + ")\n" +
+				"    as non_distinct_results\n" +
+				"    order by coalesce(attempt_count, 0), reverse(pubid), url\n" +
+				"    limit " + assignmentsLimit + ")\n" +
+				"as findAssignmentsQuery";
 
 		// The "order by" in the end makes sure the older attempted records will be re-attempted after a long time.
 		//logger.debug("findAssignmentsQuery:\n" + findAssignmentsQuery);	// DEBUG!
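The query change is easier to read in its rendered form: `datasourcetype`/`d.type` becomes `datasourcename`/`d.name` throughout, and a `d.id not in (...)` predicate is spliced in only when an exclusion list was built at startup (Impala, unlike strict ANSI SQL, accepts double-quoted string literals, which is why the IDs are wrapped in `"`). A minimal sketch of that conditional splice, with hypothetical class and method names and simplified to just the changed `where`-fragment:

```java
public class ExclusionClauseSketch {

	public static void main(String[] args) {
		// Without an exclusion list (field left null), no extra predicate is emitted.
		System.out.print(whereFragment(null));
		// With an exclusion list, the "not in" predicate is added.
		System.out.print(whereFragment("(\"opendoar____::6f4922f45568161a8cdf4ad2299f6d23\")"));
	}

	// Mirrors the ternary used inside "findAssignmentsQuery".
	static String whereFragment(String excludedDatasourceIDsStringList) {
		return "where d.allow_harvest=true and existing.id is null\n" +
				((excludedDatasourceIDsStringList != null)
						? ("and d.id not in " + excludedDatasourceIDsStringList + "\n")
						: "");
	}
}
```

The inner `limit (assignmentsLimit * 10)` presumably over-fetches candidate rows so that, after the `distinct` and the `order by coalesce(attempt_count, 0), reverse(pubid), url`, the outer `limit assignmentsLimit` can still return a full batch that favors the least-attempted records.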
application.yml
@@ -29,6 +29,13 @@ services:
       shouldEmptyBucket: false
       shouldShowAllS3Buckets: true
 
+    datasources: # Provide a list of datasource IDs which should be excluded from crawling. Their content is either bulk-imported or is known to be restricted.
+      excludedIDs: > # Use comma-separated values (one per line for best readability), as Spring is currently incapable of parsing Dropwizard-styled lists.
+        opendoar____::6f4922f45568161a8cdf4ad2299f6d23
+
+      # Since we use a multi-line value for our list, we add the ID-explanations here (otherwise the comments would become part of the values):
+      # First ID: arXiv.org e-Print Archive
+
 spring:
   datasource:
     driver-class-name: com.cloudera.impala.jdbc41.Driver
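Configuration note: the `>` folded scalar turns the indented ID lines into a single space-joined string, and Spring's `@Value` injection then splits that string on commas to populate the `List<String>` constructor parameter. This is why each ID line must end with a comma when more than one ID is listed, and why the explanatory comments live outside the scalar block. A rough plain-Java illustration of that binding (the class name and the second ID are made-up placeholders, and the splitting is an approximation of Spring's conversion, not its actual implementation):

```java
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class ExcludedIdsBindingSketch {

	public static void main(String[] args) {
		// Roughly what Spring receives after YAML folding, if two IDs were configured:
		String folded = "opendoar____::6f4922f45568161a8cdf4ad2299f6d23, opendoar____::aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";

		// Approximation of Spring's comma-splitting conversion to List<String>:
		List<String> excludedIDs = Arrays.stream(folded.split(","))
				.map(String::trim)
				.filter(id -> !id.isEmpty())
				.collect(Collectors.toList());

		excludedIDs.forEach(System.out::println);
	}
}
```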