forked from D-Net/dnet-hadoop
Duplicate elimination from usage_stats, views_stats and downloads_stats tables
This commit is contained in:
parent
6ff5436991
commit
cfea668999
|
@ -62,6 +62,7 @@ public class ExecuteWorkflow {
|
|||
static int sarcNumberOfIssnToDownload;
|
||||
|
||||
static boolean finalizeStats;
|
||||
static boolean cleanDuplicates;
|
||||
static boolean finalTablesVisibleToImpala;
|
||||
|
||||
static int numberOfDownloadThreads;
|
||||
|
@ -176,6 +177,10 @@ public class ExecuteWorkflow {
|
|||
finalizeStats = true;
|
||||
else
|
||||
finalizeStats = false;
|
||||
if (parser.get("cleanDuplicates").toLowerCase().equals("true"))
|
||||
cleanDuplicates = true;
|
||||
else
|
||||
cleanDuplicates = false;
|
||||
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
|
||||
finalTablesVisibleToImpala = true;
|
||||
else
|
||||
|
|
|
@ -147,6 +147,12 @@ public class UsageStatsExporter {
|
|||
logger.info("Finalized stats");
|
||||
}
|
||||
|
||||
// Clean duplicates from main tables
|
||||
if (ExecuteWorkflow.cleanDuplicates) {
|
||||
cleanDuplicates();
|
||||
logger.info("Finalized stats");
|
||||
}
|
||||
|
||||
// Make the tables available to Impala
|
||||
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
|
||||
logger.info("Making tables visible to Impala");
|
||||
|
@ -156,6 +162,72 @@ public class UsageStatsExporter {
|
|||
logger.info("End");
|
||||
}
|
||||
|
||||
private void cleanDuplicates() throws SQLException {
|
||||
Statement stmt = null;
|
||||
|
||||
stmt = ConnectDB.getHiveConnection().createStatement();
|
||||
|
||||
logger.info("Dropping downloads_stats_with_duplicates table");
|
||||
String dropOldDStatsTmpTable = "DROP TABLE IF EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
".downloads_stats_with_duplicates";
|
||||
stmt.executeUpdate(dropOldDStatsTmpTable);
|
||||
|
||||
// Give to downloads_stats a temporary name
|
||||
logger.info("Renaming downloads_stats to downloads_stats_with_duplicates");
|
||||
String renameDStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats RENAME TO " +
|
||||
ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates";
|
||||
stmt.executeUpdate(renameDStats);
|
||||
|
||||
// Create a new table free from duplicates
|
||||
logger.info("Creating a downloads_stats table free from duplicates");
|
||||
String cleanDStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
|
||||
"AS SELECT DISTINCT * " +
|
||||
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates";
|
||||
stmt.executeUpdate(cleanDStats);
|
||||
|
||||
logger.info("Dropping views_stats_with_duplicates table");
|
||||
String dropOldVStatsTmpTable = "DROP TABLE IF EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
".views_stats_with_duplicates";
|
||||
stmt.executeUpdate(dropOldVStatsTmpTable);
|
||||
|
||||
// Give to views_stats a temporary name
|
||||
logger.info("Renaming views_stats to views_stats_with_duplicates");
|
||||
String renameVStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".views_stats RENAME TO " +
|
||||
ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates";
|
||||
stmt.executeUpdate(renameVStats);
|
||||
|
||||
// Create a new table free from duplicates
|
||||
logger.info("Creating a views_stats table free from duplicates");
|
||||
String cleanVStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
|
||||
"AS SELECT DISTINCT * " +
|
||||
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates";
|
||||
stmt.executeUpdate(cleanVStats);
|
||||
|
||||
logger.info("Dropping usage_stats_with_duplicates table");
|
||||
String dropOldUStatsTmpTable = "DROP TABLE IF EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
".usage_stats_with_duplicates";
|
||||
stmt.executeUpdate(dropOldUStatsTmpTable);
|
||||
|
||||
// Give to downloads_stats a temporary name
|
||||
logger.info("Renaming usage_stats to usage_stats_with_duplicates");
|
||||
String renameUStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats RENAME TO " +
|
||||
ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates";
|
||||
stmt.executeUpdate(renameUStats);
|
||||
|
||||
// Create a new table free from duplicates
|
||||
logger.info("Creating a usage_stats table free from duplicates");
|
||||
String cleanUStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
|
||||
"AS SELECT DISTINCT * " +
|
||||
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates";
|
||||
stmt.executeUpdate(cleanUStats);
|
||||
|
||||
stmt.close();
|
||||
ConnectDB.getHiveConnection().close();
|
||||
}
|
||||
|
||||
private void invalidateMetadata() throws SQLException {
|
||||
Statement stmt = null;
|
||||
|
||||
|
|
|
@ -206,20 +206,25 @@
|
|||
{
|
||||
"paramName": "inod",
|
||||
"paramLongName": "sarcNumberOfIssnToDownload",
|
||||
"paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
|
||||
"paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload?",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "fs",
|
||||
"paramLongName": "finalizeStats",
|
||||
"paramDescription": "Create the usage_stats table?",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "cd",
|
||||
"paramLongName": "cleanDuplicates",
|
||||
"paramDescription": "Clean duplicates from main Stats tables (usage_stats, views_stats and downloads_stats)?",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ftvi",
|
||||
"paramLongName": "finalTablesVisibleToImpala",
|
||||
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
|
||||
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala?",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
|
|
@ -78,6 +78,7 @@
|
|||
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
|
||||
<arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
|
||||
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
|
||||
<arg>--cleanDuplicates</arg><arg>${cleanDuplicates}</arg>
|
||||
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
|
||||
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>
|
||||
<capture-output/>
|
||||
|
|
Loading…
Reference in New Issue