Duplicate elimination from usage_stats, views_stats and downloads_stats tables

This commit is contained in:
Spyros Zoupanos 2020-10-25 13:07:05 +02:00
parent 6ff5436991
commit cfea668999
4 changed files with 86 additions and 3 deletions

View File

@ -62,6 +62,7 @@ public class ExecuteWorkflow {
static int sarcNumberOfIssnToDownload;
static boolean finalizeStats;
static boolean cleanDuplicates;
static boolean finalTablesVisibleToImpala;
static int numberOfDownloadThreads;
@ -176,6 +177,10 @@ public class ExecuteWorkflow {
finalizeStats = true;
else
finalizeStats = false;
if (parser.get("cleanDuplicates").toLowerCase().equals("true"))
cleanDuplicates = true;
else
cleanDuplicates = false;
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
finalTablesVisibleToImpala = true;
else

View File

@ -147,6 +147,12 @@ public class UsageStatsExporter {
logger.info("Finalized stats");
}
// Clean duplicates from main tables
if (ExecuteWorkflow.cleanDuplicates) {
cleanDuplicates();
logger.info("Finalized stats");
}
// Make the tables available to Impala
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
logger.info("Making tables visible to Impala");
@ -156,6 +162,72 @@ public class UsageStatsExporter {
logger.info("End");
}
private void cleanDuplicates() throws SQLException {
Statement stmt = null;
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Dropping downloads_stats_with_duplicates table");
String dropOldDStatsTmpTable = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".downloads_stats_with_duplicates";
stmt.executeUpdate(dropOldDStatsTmpTable);
// Give to downloads_stats a temporary name
logger.info("Renaming downloads_stats to downloads_stats_with_duplicates");
String renameDStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats RENAME TO " +
ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates";
stmt.executeUpdate(renameDStats);
// Create a new table free from duplicates
logger.info("Creating a downloads_stats table free from duplicates");
String cleanDStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
"AS SELECT DISTINCT * " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates";
stmt.executeUpdate(cleanDStats);
logger.info("Dropping views_stats_with_duplicates table");
String dropOldVStatsTmpTable = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".views_stats_with_duplicates";
stmt.executeUpdate(dropOldVStatsTmpTable);
// Give to views_stats a temporary name
logger.info("Renaming views_stats to views_stats_with_duplicates");
String renameVStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".views_stats RENAME TO " +
ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates";
stmt.executeUpdate(renameVStats);
// Create a new table free from duplicates
logger.info("Creating a views_stats table free from duplicates");
String cleanVStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
"AS SELECT DISTINCT * " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates";
stmt.executeUpdate(cleanVStats);
logger.info("Dropping usage_stats_with_duplicates table");
String dropOldUStatsTmpTable = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".usage_stats_with_duplicates";
stmt.executeUpdate(dropOldUStatsTmpTable);
// Give to downloads_stats a temporary name
logger.info("Renaming usage_stats to usage_stats_with_duplicates");
String renameUStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats RENAME TO " +
ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates";
stmt.executeUpdate(renameUStats);
// Create a new table free from duplicates
logger.info("Creating a usage_stats table free from duplicates");
String cleanUStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
"AS SELECT DISTINCT * " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates";
stmt.executeUpdate(cleanUStats);
stmt.close();
ConnectDB.getHiveConnection().close();
}
private void invalidateMetadata() throws SQLException {
Statement stmt = null;

View File

@ -206,20 +206,25 @@
{
"paramName": "inod",
"paramLongName": "sarcNumberOfIssnToDownload",
"paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
"paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload?",
"paramRequired": true
},
{
"paramName": "fs",
"paramLongName": "finalizeStats",
"paramDescription": "Create the usage_stats table?",
"paramRequired": true
},
{
"paramName": "cd",
"paramLongName": "cleanDuplicates",
"paramDescription": "Clean duplicates from main Stats tables (usage_stats, views_stats and downloads_stats)?",
"paramRequired": true
},
{
"paramName": "ftvi",
"paramLongName": "finalTablesVisibleToImpala",
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala?",
"paramRequired": true
},
{

View File

@ -78,6 +78,7 @@
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
<arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
<arg>--cleanDuplicates</arg><arg>${cleanDuplicates}</arg>
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>
<capture-output/>