forked from D-Net/dnet-hadoop
Duplicate elimination from usage_stats, views_stats and downloads_stats tables
This commit is contained in:
parent
6ff5436991
commit
cfea668999
|
@ -62,6 +62,7 @@ public class ExecuteWorkflow {
|
||||||
static int sarcNumberOfIssnToDownload;
|
static int sarcNumberOfIssnToDownload;
|
||||||
|
|
||||||
static boolean finalizeStats;
|
static boolean finalizeStats;
|
||||||
|
static boolean cleanDuplicates;
|
||||||
static boolean finalTablesVisibleToImpala;
|
static boolean finalTablesVisibleToImpala;
|
||||||
|
|
||||||
static int numberOfDownloadThreads;
|
static int numberOfDownloadThreads;
|
||||||
|
@ -176,6 +177,10 @@ public class ExecuteWorkflow {
|
||||||
finalizeStats = true;
|
finalizeStats = true;
|
||||||
else
|
else
|
||||||
finalizeStats = false;
|
finalizeStats = false;
|
||||||
|
if (parser.get("cleanDuplicates").toLowerCase().equals("true"))
|
||||||
|
cleanDuplicates = true;
|
||||||
|
else
|
||||||
|
cleanDuplicates = false;
|
||||||
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
|
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
|
||||||
finalTablesVisibleToImpala = true;
|
finalTablesVisibleToImpala = true;
|
||||||
else
|
else
|
||||||
|
|
|
@ -147,6 +147,12 @@ public class UsageStatsExporter {
|
||||||
logger.info("Finalized stats");
|
logger.info("Finalized stats");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean duplicates from main tables
|
||||||
|
if (ExecuteWorkflow.cleanDuplicates) {
|
||||||
|
cleanDuplicates();
|
||||||
|
logger.info("Finalized stats");
|
||||||
|
}
|
||||||
|
|
||||||
// Make the tables available to Impala
|
// Make the tables available to Impala
|
||||||
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
|
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
|
||||||
logger.info("Making tables visible to Impala");
|
logger.info("Making tables visible to Impala");
|
||||||
|
@ -156,6 +162,72 @@ public class UsageStatsExporter {
|
||||||
logger.info("End");
|
logger.info("End");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void cleanDuplicates() throws SQLException {
|
||||||
|
Statement stmt = null;
|
||||||
|
|
||||||
|
stmt = ConnectDB.getHiveConnection().createStatement();
|
||||||
|
|
||||||
|
logger.info("Dropping downloads_stats_with_duplicates table");
|
||||||
|
String dropOldDStatsTmpTable = "DROP TABLE IF EXISTS " +
|
||||||
|
ConnectDB.getUsageStatsDBSchema() +
|
||||||
|
".downloads_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(dropOldDStatsTmpTable);
|
||||||
|
|
||||||
|
// Give to downloads_stats a temporary name
|
||||||
|
logger.info("Renaming downloads_stats to downloads_stats_with_duplicates");
|
||||||
|
String renameDStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats RENAME TO " +
|
||||||
|
ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(renameDStats);
|
||||||
|
|
||||||
|
// Create a new table free from duplicates
|
||||||
|
logger.info("Creating a downloads_stats table free from duplicates");
|
||||||
|
String cleanDStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
|
||||||
|
"AS SELECT DISTINCT * " +
|
||||||
|
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(cleanDStats);
|
||||||
|
|
||||||
|
logger.info("Dropping views_stats_with_duplicates table");
|
||||||
|
String dropOldVStatsTmpTable = "DROP TABLE IF EXISTS " +
|
||||||
|
ConnectDB.getUsageStatsDBSchema() +
|
||||||
|
".views_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(dropOldVStatsTmpTable);
|
||||||
|
|
||||||
|
// Give to views_stats a temporary name
|
||||||
|
logger.info("Renaming views_stats to views_stats_with_duplicates");
|
||||||
|
String renameVStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".views_stats RENAME TO " +
|
||||||
|
ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(renameVStats);
|
||||||
|
|
||||||
|
// Create a new table free from duplicates
|
||||||
|
logger.info("Creating a views_stats table free from duplicates");
|
||||||
|
String cleanVStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
|
||||||
|
"AS SELECT DISTINCT * " +
|
||||||
|
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(cleanVStats);
|
||||||
|
|
||||||
|
logger.info("Dropping usage_stats_with_duplicates table");
|
||||||
|
String dropOldUStatsTmpTable = "DROP TABLE IF EXISTS " +
|
||||||
|
ConnectDB.getUsageStatsDBSchema() +
|
||||||
|
".usage_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(dropOldUStatsTmpTable);
|
||||||
|
|
||||||
|
// Give to downloads_stats a temporary name
|
||||||
|
logger.info("Renaming usage_stats to usage_stats_with_duplicates");
|
||||||
|
String renameUStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats RENAME TO " +
|
||||||
|
ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(renameUStats);
|
||||||
|
|
||||||
|
// Create a new table free from duplicates
|
||||||
|
logger.info("Creating a usage_stats table free from duplicates");
|
||||||
|
String cleanUStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
|
||||||
|
"AS SELECT DISTINCT * " +
|
||||||
|
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates";
|
||||||
|
stmt.executeUpdate(cleanUStats);
|
||||||
|
|
||||||
|
stmt.close();
|
||||||
|
ConnectDB.getHiveConnection().close();
|
||||||
|
}
|
||||||
|
|
||||||
private void invalidateMetadata() throws SQLException {
|
private void invalidateMetadata() throws SQLException {
|
||||||
Statement stmt = null;
|
Statement stmt = null;
|
||||||
|
|
||||||
|
|
|
@ -206,20 +206,25 @@
|
||||||
{
|
{
|
||||||
"paramName": "inod",
|
"paramName": "inod",
|
||||||
"paramLongName": "sarcNumberOfIssnToDownload",
|
"paramLongName": "sarcNumberOfIssnToDownload",
|
||||||
"paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
|
"paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload?",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"paramName": "fs",
|
"paramName": "fs",
|
||||||
"paramLongName": "finalizeStats",
|
"paramLongName": "finalizeStats",
|
||||||
"paramDescription": "Create the usage_stats table?",
|
"paramDescription": "Create the usage_stats table?",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "cd",
|
||||||
|
"paramLongName": "cleanDuplicates",
|
||||||
|
"paramDescription": "Clean duplicates from main Stats tables (usage_stats, views_stats and downloads_stats)?",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "ftvi",
|
"paramName": "ftvi",
|
||||||
"paramLongName": "finalTablesVisibleToImpala",
|
"paramLongName": "finalTablesVisibleToImpala",
|
||||||
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
|
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala?",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -78,6 +78,7 @@
|
||||||
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
|
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
|
||||||
<arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
|
<arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
|
||||||
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
|
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
|
||||||
|
<arg>--cleanDuplicates</arg><arg>${cleanDuplicates}</arg>
|
||||||
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
|
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
|
||||||
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>
|
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>
|
||||||
<capture-output/>
|
<capture-output/>
|
||||||
|
|
Loading…
Reference in New Issue