Optimize the test-DB creation process:

- Create views over the "initialDatabase" view and tables, instead of copying their data, to a) reduce the amount of space used by test-DBs and b) improve test-DB creation performance.
- Avoid possible failures from outdated metadata.
This commit is contained in:
Lampros Smyrnaios 2024-03-14 13:10:54 +02:00
parent f61cae41a1
commit b34417dc45
1 changed file with 33 additions and 15 deletions

View File

@ -58,23 +58,29 @@ public class DatabaseConnector {
logger.info("Going to create (if not exist) the TEST-database \"" + testDatabaseName + "\" and its tables. Also will fill some tables with data from the initial-database \"" + initialDatabaseName + "\"."); logger.info("Going to create (if not exist) the TEST-database \"" + testDatabaseName + "\" and its tables. Also will fill some tables with data from the initial-database \"" + initialDatabaseName + "\".");
jdbcTemplate.execute("CREATE DATABASE IF NOT EXISTS " + testDatabaseName); jdbcTemplate.execute("CREATE DATABASE IF NOT EXISTS " + testDatabaseName);
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication stored as parquet as select * from " + initialDatabaseName + ".publication"); try { // Metastore takes some time to recognize the DB has been created, in order to use it later..
jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".publication"); Thread.sleep(1000);
} catch (InterruptedException ignore) {}
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication_pids stored as parquet as select * from " + initialDatabaseName + ".publication_pids"); jdbcTemplate.update("INVALIDATE METADATA");
jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".publication_pids");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication_urls stored as parquet as select * from " + initialDatabaseName + ".publication_urls"); try { // Metastore takes some time to recognize the DB has been created, in order to use it later..
jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".publication_urls"); Thread.sleep(1000);
} catch (InterruptedException ignore) {}
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication_boost stored as parquet as select * from " + initialDatabaseName + ".publication_boost"); // Create VIEWs of the original data. We just READ from it, so it's safe for our testing environment..
jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".publication_boost");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".datasource stored as parquet as select * from " + initialDatabaseName + ".datasource"); jdbcTemplate.execute("CREATE VIEW IF NOT EXISTS " + testDatabaseName + ".publication as select * from " + initialDatabaseName + ".publication");
jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".datasource");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".payload_legacy stored as parquet as select * from " + initialDatabaseName + ".payload_legacy"); jdbcTemplate.execute("CREATE VIEW IF NOT EXISTS " + testDatabaseName + ".publication_pids as select * from " + initialDatabaseName + ".publication_pids");
jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".payload_legacy");
jdbcTemplate.execute("CREATE VIEW IF NOT EXISTS " + testDatabaseName + ".publication_urls as select * from " + initialDatabaseName + ".publication_urls");
jdbcTemplate.execute("CREATE VIEW IF NOT EXISTS " + testDatabaseName + ".publication_boost as select * from " + initialDatabaseName + ".publication_boost");
jdbcTemplate.execute("CREATE VIEW IF NOT EXISTS " + testDatabaseName + ".datasource as select * from " + initialDatabaseName + ".datasource");
jdbcTemplate.execute("CREATE VIEW IF NOT EXISTS " + testDatabaseName + ".payload_legacy as select * from " + initialDatabaseName + ".payload_legacy");
databaseName = testDatabaseName; databaseName = testDatabaseName;
} else { } else {
@ -107,9 +113,21 @@ public class DatabaseConnector {
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".payload_bulk_import (id string, original_url string, actual_url string, `date` bigint, mimetype string, size string, `hash` string, `location` string, provenance string) stored as parquet"); jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".payload_bulk_import (id string, original_url string, actual_url string, `date` bigint, mimetype string, size string, `hash` string, `location` string, provenance string) stored as parquet");
jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".payload_bulk_import"); jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".payload_bulk_import");
jdbcTemplate.execute("CREATE VIEW IF NOT EXISTS " + databaseName + ".payload " +
"AS SELECT * from " + databaseName + ".payload_legacy " + try { // Metastore takes some time to recognize the tables have been created, in order to use them in the view.
"UNION ALL SELECT * FROM " + databaseName +".payload_aggregated " + Thread.sleep(1000);
} catch (InterruptedException ignore) {}
jdbcTemplate.update("INVALIDATE METADATA");
try { // Metastore takes some time to recognize the tables have been created, in order to use them in the view.
Thread.sleep(1000);
} catch (InterruptedException ignore) {}
jdbcTemplate.execute("CREATE VIEW IF NOT EXISTS " + databaseName + ".payload\n" +
"AS SELECT * from " + databaseName + ".payload_legacy\n" +
"UNION ALL SELECT * FROM " + databaseName +".payload_aggregated\n" +
"UNION ALL SELECT * FROM " + databaseName + ".payload_bulk_import"); "UNION ALL SELECT * FROM " + databaseName + ".payload_bulk_import");
// We do not do the "compute stats" for the view, since we get the following error: "COMPUTE STATS not supported for view: pdfaggregationdatabase_payloads_view.payload". // We do not do the "compute stats" for the view, since we get the following error: "COMPUTE STATS not supported for view: pdfaggregationdatabase_payloads_view.payload".