forked from lsmyrnaios/UrlsController
- Optimize the JOIN-order in the "findAssignmentsQuery".
- Optimize the "DOC_URL_FILTER"-regex.
- Update dependencies.
This commit is contained in:
parent
43ea64758d
commit
dd394f18a0
10
build.gradle
10
build.gradle
|
@ -49,12 +49,12 @@ dependencies {
|
|||
implementation group: 'org.apache.commons', name: 'commons-lang3', version: '3.14.0'
|
||||
|
||||
// https://mvnrepository.com/artifact/org.apache.commons/commons-compress
|
||||
implementation("org.apache.commons:commons-compress:1.25.0") {
|
||||
implementation("org.apache.commons:commons-compress:1.26.1") {
|
||||
exclude group: 'com.github.luben', module: 'zstd-jni'
|
||||
}
|
||||
implementation 'com.github.luben:zstd-jni:1.5.5-11' // Even though this is part of the above dependency, the Apache commons rarely updates it, while the zstd team makes improvements very often.
|
||||
|
||||
implementation 'io.minio:minio:8.5.7'
|
||||
implementation 'io.minio:minio:8.5.9'
|
||||
|
||||
// https://mvnrepository.com/artifact/com.cloudera.impala/jdbc
|
||||
implementation("com.cloudera.impala:jdbc:2.5.31") {
|
||||
|
@ -110,17 +110,17 @@ dependencies {
|
|||
|
||||
// Add back some updated version of the needed dependencies.
|
||||
implementation 'org.apache.thrift:libthrift:0.17.0' // Newer versions (>=0.18.X) are not compatible with JAVA 8.
|
||||
implementation 'com.fasterxml.woodstox:woodstox-core:6.6.0'
|
||||
implementation 'com.fasterxml.woodstox:woodstox-core:6.6.1'
|
||||
|
||||
// https://mvnrepository.com/artifact/org.json/json
|
||||
implementation 'org.json:json:20231013'
|
||||
implementation 'org.json:json:20240303'
|
||||
|
||||
// https://mvnrepository.com/artifact/com.google.code.gson/gson
|
||||
implementation 'com.google.code.gson:gson:2.10.1'
|
||||
|
||||
|
||||
// https://mvnrepository.com/artifact/io.micrometer/micrometer-registry-prometheus
|
||||
runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.2'
|
||||
runtimeOnly 'io.micrometer:micrometer-registry-prometheus:1.12.3'
|
||||
|
||||
testImplementation 'org.springframework.security:spring-security-test'
|
||||
testImplementation "org.springframework.boot:spring-boot-starter-test"
|
||||
|
|
|
@ -68,7 +68,7 @@ public class UrlsServiceImpl implements UrlsService {
|
|||
private static String excludedDatasourceIDsStringList = null;
|
||||
|
||||
|
||||
private static final String DOC_URL_FILTER = ".+(?:pdf|download|/doc|document|(?:/|[?]|&)file|/fulltext|attachment|/paper|viewfile|viewdoc|/get|cgi/viewcontent.cgi\\?|t[ée]l[ée]charger|descargar).*";
|
||||
private static final String DOC_URL_FILTER = ".+(?:pdf|download|/doc|document|(?:/|[?]|&)file|/fulltext|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[ée]l[ée]charger|descargar).*";
|
||||
// "DOC_URL_FILTER" works for lowerCase Strings (we use the "ignore-case" indicator in the "regexp_like()" method).
|
||||
|
||||
|
||||
|
@ -122,11 +122,11 @@ public class UrlsServiceImpl implements UrlsService {
|
|||
String findAssignmentsQuery =
|
||||
"select pubid, url, datasourceid, datasourcename\n" + // Select the final sorted data with "assignmentsLimit".
|
||||
"from (select distinct p.id as pubid, pu.url as url, pb.level as level, attempts.counts as attempt_count, p.year as pub_year, d.id as datasourceid, d.name as datasourcename\n" + // Select the distinct id-url data. Beware that this will return duplicate id-url pairs, since one pair may be associated with multiple datasources.
|
||||
" from " + DatabaseConnector.databaseName + ".publication p\n" +
|
||||
" from " + DatabaseConnector.databaseName + ".publication_urls pu\n" +
|
||||
" join " + DatabaseConnector.databaseName + ".publication p on p.id=pu.id\n" +
|
||||
" join " + DatabaseConnector.databaseName + ".datasource d on d.id=p.datasourceid and d.allow_harvest=true"+
|
||||
((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below.
|
||||
(" and d.id not in " + excludedDatasourceIDsStringList + GenericUtils.endOfLine) : "") +
|
||||
" join " + DatabaseConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" +
|
||||
" left anti join (select a.original_url from " + DatabaseConnector.databaseName + ".assignment a\n" +
|
||||
" union all\n" +
|
||||
" select pl.original_url from " + DatabaseConnector.databaseName + ".payload pl\n" + // Here we access the payload-VIEW which includes the three payload-tables.
|
||||
|
|
Loading…
Reference in New Issue