diff --git a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java index 9e562ee..3c13be0 100644 --- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java +++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java @@ -47,6 +47,7 @@ public class ImpalaConnector { } catch (Exception e) { logger.error("Error testing if database supports batch updates!", e); } + createDatabase(); // In case of an exception, the App will exit with the stacktrace. } @@ -66,6 +67,10 @@ public class ImpalaConnector { jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication_urls stored as parquet as select * from " + initialDatabaseName + ".publication_urls"); jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".publication_urls"); + + // TODO - Should we add the "publication_oids" table? Will it be used in the "getAssignments" query? + + jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".datasource stored as parquet as select * from " + initialDatabaseName + ".datasource"); jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".datasource"); diff --git a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java index 3e0bed3..76b1680 100644 --- a/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java +++ b/src/main/java/eu/openaire/urls_controller/services/UrlsServiceImpl.java @@ -86,7 +86,7 @@ public class UrlsServiceImpl implements UrlsService { { // Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >. String findAssignmentsQuery = - "select pubid, url, datasourceid, datasourcename\n" + + "select pubid, url, datasourceid, datasourcename\n" + // The datasourceName is currently not used. 
It may be used by the Worker, in the future to apply a datasource-specific aggregation plugin to take the full-texts quickly, instead of using the general crawling one. "from (select distinct pubid, url, datasourceid, datasourcename, attempt_count, pub_year\n" + " from (select p.id as pubid, p.year as pub_year, pu.url as url, d.id as datasourceid, d.name as datasourcename, attempts.counts as attempt_count\n" + " from " + ImpalaConnector.databaseName + ".publication p\n" + @@ -94,17 +94,19 @@ public class UrlsServiceImpl implements UrlsService { " join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + " left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts\n" + " on attempts.id=p.id\n" + - " left outer join (select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" + - " union all\n" + - " select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing\n" + // Here we access the payload-VIEW which includes the three payload-tables. + " left outer join (\n" + + " select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" + + " union all\n" + + " select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl\n" + + " ) as existing\n" + // Here we access the payload-VIEW which includes the three payload-tables. " on existing.id=p.id and existing.original_url=pu.url\n" + - " where d.allow_harvest=true and existing.id is null\n" + + " where d.allow_harvest=true and existing.id is null\n" + ((excludedDatasourceIDsStringList != null) ? // If we have an exclusion-list, use it below. 
- (" and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") + - " and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" + - " and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" + - " and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing. - " limit " + (assignmentsLimit * 10) + ")\n" + + (" and d.id not in " + excludedDatasourceIDsStringList + "\n") : "") + + " and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecordAtomic.get() + "\n" + + " and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" + + " and pu.url != '' and pu.url is not null\n" + // Some IDs have empty-string urls, there are no "null" urls, but keep the relevant check for future-proofing. + " limit " + (assignmentsLimit * 10) + ")\n" + " as non_distinct_results\n" + " order by coalesce(attempt_count, 0), coalesce(pub_year, 0) desc, reverse(pubid), url\n" + " limit " + assignmentsLimit + ")\n" + @@ -114,7 +116,7 @@ public class UrlsServiceImpl implements UrlsService { // The "order by" in the end makes sure the older attempted records will be re-attempted after a long time. //logger.debug("findAssignmentsQuery:\n" + findAssignmentsQuery); // DEBUG! 
- String getAssignmentsQuery = "select * from " + ImpalaConnector.databaseName + ".current_assignment"; + final String getAssignmentsQuery = "select * from " + ImpalaConnector.databaseName + ".current_assignment"; List assignments = new ArrayList<>(assignmentsLimit); @@ -166,9 +168,8 @@ public class UrlsServiceImpl implements UrlsService { return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); } else return ResponseEntity.status(HttpStatus.MULTI_STATUS).body(new AssignmentsResponse((long) -1, null)); - } else if ( assignmentsSize < assignmentsLimit ) { + } else if ( assignmentsSize < assignmentsLimit ) logger.warn("The retrieved results were fewer (" + assignmentsSize + ") than the \"assignmentsLimit\" (" + assignmentsLimit + "), for worker with id: " + workerId + ". Will increase the \"maxAttempts\" to " + maxAttemptsPerRecordAtomic.incrementAndGet() + " for the next requests."); - } logger.debug("Finished gathering " + assignmentsSize + " assignments for worker with id \"" + workerId + "\". Going to insert them into the \"assignment\" table and then return them to the worker."); diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 7e2eba1..7accd4f 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -125,7 +125,7 @@ public class FileUtils { return UploadFullTextsResponse.unsuccessful; } String remoteAddr = request.getHeader("X-FORWARDED-FOR"); - if ( remoteAddr == null || "".equals(remoteAddr) ) + if ( (remoteAddr == null) || "".equals(remoteAddr) ) remoteAddr = request.getRemoteAddr(); // Get the file-locations. 
diff --git a/src/main/java/eu/openaire/urls_controller/util/UriBuilder.java b/src/main/java/eu/openaire/urls_controller/util/UriBuilder.java index 8a076f7..4060f48 100644 --- a/src/main/java/eu/openaire/urls_controller/util/UriBuilder.java +++ b/src/main/java/eu/openaire/urls_controller/util/UriBuilder.java @@ -31,8 +31,7 @@ public class UriBuilder { baseUrl += sslEnabled.equals("true") ? "s" : ""; baseUrl += "://"; - ip = getPublicIP(); - if ( ip == null ) + if ( (ip = getPublicIP()) == null ) ip = InetAddress.getLoopbackAddress().getHostAddress(); // Non-null. baseUrl += ip + ":" + webServerAppCtxt.getWebServer().getPort(); diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index b8a732e..be9ef81 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -14,7 +14,7 @@ services: db: initialDatabaseName: pdfaggregation_i - testDatabaseName: pdfaggregationdatabase_payloads_view + testDatabaseName: pdfaggregationdatabase_payloads_view_test assignmentLimit: 10000 maxAttemptsPerRecord: 3