diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java index 50b951cbc..eb925f89b 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java @@ -62,6 +62,7 @@ public class ExecuteWorkflow { static int sarcNumberOfIssnToDownload; static boolean finalizeStats; + static boolean cleanDuplicates; static boolean finalTablesVisibleToImpala; static int numberOfDownloadThreads; @@ -176,6 +177,10 @@ public class ExecuteWorkflow { finalizeStats = true; else finalizeStats = false; + if (parser.get("cleanDuplicates").toLowerCase().equals("true")) + cleanDuplicates = true; + else + cleanDuplicates = false; if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) finalTablesVisibleToImpala = true; else diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java index 46fc82ee6..8a18c71fc 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java @@ -147,6 +147,12 @@ public class UsageStatsExporter { logger.info("Finalized stats"); } + // Clean duplicates from main tables + if (ExecuteWorkflow.cleanDuplicates) { + cleanDuplicates(); + logger.info("Finalized stats"); + } + // Make the tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) { logger.info("Making tables visible to Impala"); @@ -156,6 +162,72 @@ public class UsageStatsExporter { logger.info("End"); } + private void cleanDuplicates() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping downloads_stats_with_duplicates table"); + String dropOldDStatsTmpTable = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_with_duplicates"; + stmt.executeUpdate(dropOldDStatsTmpTable); + + // Give to downloads_stats a temporary name + logger.info("Renaming downloads_stats to downloads_stats_with_duplicates"); + String renameDStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats RENAME TO " + + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates"; + stmt.executeUpdate(renameDStats); + + // Create a new table free from duplicates + logger.info("Creating a downloads_stats table free from duplicates"); + String cleanDStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "AS SELECT DISTINCT * " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates"; + stmt.executeUpdate(cleanDStats); + + logger.info("Dropping views_stats_with_duplicates table"); + String dropOldVStatsTmpTable = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".views_stats_with_duplicates"; + stmt.executeUpdate(dropOldVStatsTmpTable); + + // Give to views_stats a temporary name + logger.info("Renaming views_stats to views_stats_with_duplicates"); + String renameVStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".views_stats RENAME TO " + + ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates"; + stmt.executeUpdate(renameVStats); + + // Create a new table free from duplicates + logger.info("Creating a views_stats table free from duplicates"); + String cleanVStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "AS SELECT DISTINCT * " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates"; + stmt.executeUpdate(cleanVStats); + + logger.info("Dropping usage_stats_with_duplicates table"); + String dropOldUStatsTmpTable = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".usage_stats_with_duplicates"; + stmt.executeUpdate(dropOldUStatsTmpTable); + + // Give to downloads_stats a temporary name + logger.info("Renaming usage_stats to usage_stats_with_duplicates"); + String renameUStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats RENAME TO " + + ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates"; + stmt.executeUpdate(renameUStats); + + // Create a new table free from duplicates + logger.info("Creating a usage_stats table free from duplicates"); + String cleanUStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + + "AS SELECT DISTINCT * " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates"; + stmt.executeUpdate(cleanUStats); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } + private void invalidateMetadata() throws SQLException { Statement stmt = null; diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json index 988c23b48..f7caf7f29 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json @@ -206,20 +206,25 @@ { "paramName": "inod", "paramLongName": "sarcNumberOfIssnToDownload", - "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", + "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload?", "paramRequired": true }, - { "paramName": "fs", "paramLongName": "finalizeStats", "paramDescription": "Create the usage_stats table?", "paramRequired": true }, + { + "paramName": "cd", + "paramLongName": "cleanDuplicates", + "paramDescription": "Clean duplicates from main Stats tables (usage_stats, views_stats and downloads_stats)?", + "paramRequired": true + }, { "paramName": "ftvi", "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", + "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala?", "paramRequired": true }, { diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml index 8d62a85a9..1886dafd9 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml @@ -78,6 +78,7 @@ --sarcProcessStats${sarcProcessStats} --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} --finalizeStats${finalizeStats} + --cleanDuplicates${cleanDuplicates} --finalTablesVisibleToImpala${finalTablesVisibleToImpala} --numberOfDownloadThreads${numberOfDownloadThreads}