diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java
index 50b951cbc..eb925f89b 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java
@@ -62,6 +62,7 @@ public class ExecuteWorkflow {
static int sarcNumberOfIssnToDownload;
static boolean finalizeStats;
+ static boolean cleanDuplicates;
static boolean finalTablesVisibleToImpala;
static int numberOfDownloadThreads;
@@ -176,6 +177,10 @@ public class ExecuteWorkflow {
finalizeStats = true;
else
finalizeStats = false;
+ if (parser.get("cleanDuplicates").toLowerCase().equals("true"))
+ cleanDuplicates = true;
+ else
+ cleanDuplicates = false;
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
finalTablesVisibleToImpala = true;
else
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
index 46fc82ee6..8a18c71fc 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
@@ -147,6 +147,12 @@ public class UsageStatsExporter {
logger.info("Finalized stats");
}
+ // Clean duplicates from main tables
+ if (ExecuteWorkflow.cleanDuplicates) {
+ cleanDuplicates();
+ logger.info("Finalized stats");
+ }
+
// Make the tables available to Impala
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
logger.info("Making tables visible to Impala");
@@ -156,6 +162,72 @@ public class UsageStatsExporter {
logger.info("End");
}
+ private void cleanDuplicates() throws SQLException {
+ Statement stmt = null;
+
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Dropping downloads_stats_with_duplicates table");
+ String dropOldDStatsTmpTable = "DROP TABLE IF EXISTS " +
+ ConnectDB.getUsageStatsDBSchema() +
+ ".downloads_stats_with_duplicates";
+ stmt.executeUpdate(dropOldDStatsTmpTable);
+
+ // Give to downloads_stats a temporary name
+ logger.info("Renaming downloads_stats to downloads_stats_with_duplicates");
+ String renameDStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats RENAME TO " +
+ ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates";
+ stmt.executeUpdate(renameDStats);
+
+ // Create a new table free from duplicates
+ logger.info("Creating a downloads_stats table free from duplicates");
+ String cleanDStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
+ "AS SELECT DISTINCT * " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_with_duplicates";
+ stmt.executeUpdate(cleanDStats);
+
+ logger.info("Dropping views_stats_with_duplicates table");
+ String dropOldVStatsTmpTable = "DROP TABLE IF EXISTS " +
+ ConnectDB.getUsageStatsDBSchema() +
+ ".views_stats_with_duplicates";
+ stmt.executeUpdate(dropOldVStatsTmpTable);
+
+ // Give to views_stats a temporary name
+ logger.info("Renaming views_stats to views_stats_with_duplicates");
+ String renameVStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".views_stats RENAME TO " +
+ ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates";
+ stmt.executeUpdate(renameVStats);
+
+ // Create a new table free from duplicates
+ logger.info("Creating a views_stats table free from duplicates");
+ String cleanVStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
+ "AS SELECT DISTINCT * " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_with_duplicates";
+ stmt.executeUpdate(cleanVStats);
+
+ logger.info("Dropping usage_stats_with_duplicates table");
+ String dropOldUStatsTmpTable = "DROP TABLE IF EXISTS " +
+ ConnectDB.getUsageStatsDBSchema() +
+ ".usage_stats_with_duplicates";
+ stmt.executeUpdate(dropOldUStatsTmpTable);
+
+ // Give to downloads_stats a temporary name
+ logger.info("Renaming usage_stats to usage_stats_with_duplicates");
+ String renameUStats = "ALTER TABLE " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats RENAME TO " +
+ ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates";
+ stmt.executeUpdate(renameUStats);
+
+ // Create a new table free from duplicates
+ logger.info("Creating a usage_stats table free from duplicates");
+ String cleanUStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
+ "AS SELECT DISTINCT * " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats_with_duplicates";
+ stmt.executeUpdate(cleanUStats);
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ }
+
private void invalidateMetadata() throws SQLException {
Statement stmt = null;
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json
index 988c23b48..f7caf7f29 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json
@@ -206,20 +206,25 @@
{
"paramName": "inod",
"paramLongName": "sarcNumberOfIssnToDownload",
- "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
+ "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload?",
"paramRequired": true
},
-
{
"paramName": "fs",
"paramLongName": "finalizeStats",
"paramDescription": "Create the usage_stats table?",
"paramRequired": true
},
+ {
+ "paramName": "cd",
+ "paramLongName": "cleanDuplicates",
+ "paramDescription": "Clean duplicates from main Stats tables (usage_stats, views_stats and downloads_stats)?",
+ "paramRequired": true
+ },
{
"paramName": "ftvi",
"paramLongName": "finalTablesVisibleToImpala",
- "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
+ "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala?",
"paramRequired": true
},
{
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml
index 8d62a85a9..1886dafd9 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml
@@ -78,6 +78,7 @@
--sarcProcessStats${sarcProcessStats}
--sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload}
--finalizeStats${finalizeStats}
+ --cleanDuplicates${cleanDuplicates}
--finalTablesVisibleToImpala${finalTablesVisibleToImpala}
--numberOfDownloadThreads${numberOfDownloadThreads}