diff --git a/dhp-workflows/dhp-usage-datacite-stats-update/src/main/java/eu/dnetlib/oa/graph/dataciteusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datacite-stats-update/src/main/java/eu/dnetlib/oa/graph/dataciteusagestats/export/ReadReportsListFromDatacite.java index ab0844346..f404904de 100755 --- a/dhp-workflows/dhp-usage-datacite-stats-update/src/main/java/eu/dnetlib/oa/graph/dataciteusagestats/export/ReadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datacite-stats-update/src/main/java/eu/dnetlib/oa/graph/dataciteusagestats/export/ReadReportsListFromDatacite.java @@ -397,8 +397,8 @@ public class ReadReportsListFromDatacite { + ".datacite_downloads STORED AS PARQUET as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " - + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " - + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " + + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on lower(name)=lower(platform) " + + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on lower(string(ds_type))=lower(od.oid) " + "where metric_type='total-dataset-requests' "; stmt.executeUpdate(createDownloadsTable); logger.info("Downloads Stats table created"); @@ -408,8 +408,8 @@ public class ReadReportsListFromDatacite { + ".datacite_views STORED AS PARQUET as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " - + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " - + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " + + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on lower(name)=lower(platform) " + + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on lower(string(ds_type))=lower(od.oid) " + "where metric_type='total-dataset-investigations' "; stmt.executeUpdate(createViewsTable); logger.info("Views Stats table created"); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java index 591e888b5..36f0b42c6 100755 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java @@ -158,7 +158,7 @@ public class LaReferenciaDownloadLogs { // end.add(Calendar.MONTH, +1); // end.add(Calendar.DAY_OF_MONTH, -1); Calendar end = Calendar.getInstance(); - end.add(Calendar.DAY_OF_MONTH, -1); + end.add(Calendar.DAY_OF_MONTH, -3); logger.info("Ending period for log download: " + sdf.format(end.getTime())); @@ -205,7 +205,7 @@ public class LaReferenciaDownloadLogs { true); String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format - + "&expanded=5&filter_limit=500&token_auth=" + tokenAuth; + + "&expanded=5&filter_limit=100&token_auth=" + tokenAuth; String content = ""; int i = 0; diff --git 
a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java index 27c9d87af..132c300a3 100755 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java @@ -228,6 +228,12 @@ public class PiwikDownloadLogs { while (rs.next()) { piwikIdToVisit.add(rs.getInt(1)); } + piwikIdToVisit.add(630); + piwikIdToVisit.add(662); + piwikIdToVisit.add(694); + piwikIdToVisit.add(725); + piwikIdToVisit.add(728); + logger.info("Found the following piwikIds for download: " + piwikIdToVisit); if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java index 58909ef4e..955faf98a 100755 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java @@ -51,7 +51,6 @@ public class UsageStatsExporter { ConnectDB.init(); PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); - logger.info("Re-creating database and tables"); if (ExecuteWorkflow.recreateDbAndTables) { piwikstatsdb.recreateDBAndTables(); diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java index 60c4afb30..7cb571a9b 100755 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java @@ -41,8 +41,42 @@ public class LaReferenciaStats { public LaReferenciaStats() throws Exception { } + public void createDistinctLaReferenciaLog() throws Exception { + logger.info("Initialising DB properties"); + ConnectDB.init(); + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping lareferencialogdistinct"); + String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct"; + stmt.executeUpdate(sql); + logger.info("Dropped lareferencialogdistinct"); + + logger.info("Creating lareferencialogdistinct table"); + // Create Piwiklogdistinct table - This table should exist + String sqlCreateTablePiwikLogDistinct = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogdistinct(matomoid INT, source STRING, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) " + + "into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePiwikLogDistinct); + logger.info("Created lareferencialogdistinct table"); 
+ + logger.info("Inserting data to lareferencialogdistinct"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + + "SELECT DISTINCT * FROM " + ConnectDB.getUsageRawDataDBSchema() + + ".lareferencialog WHERE entity_id is not null"; + stmt.executeUpdate(sql); + logger.info("Inserted data to lareferencialogdistinct"); + } + public void processLogs() throws Exception { try { + logger.info("Creating LareferenciaLogDistinct"); + createDistinctLaReferenciaLog(); + logger.info("LaReferencia creating viewsStats"); viewsStats(); logger.info("LaReferencia created viewsStats"); @@ -76,7 +110,7 @@ public class LaReferenciaStats { "SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + "THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='action' and " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct where action='action' and " + "(source_item_type='oaItem' or source_item_type='repItem') " + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + "source ORDER BY source, entity_id"; @@ -117,7 +151,7 @@ public class LaReferenciaStats { "SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + "THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='download' and " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct where action='download' and " + "(source_item_type='oaItem' or source_item_type='repItem') " + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + "source ORDER BY source, entity_id"; @@ -160,7 +194,7 @@ public class LaReferenciaStats { + "CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_investigations, " + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "WHERE (source_item_type='oaItem' or source_item_type='repItem') " + "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source "; @@ -192,7 +226,7 @@ public class LaReferenciaStats { + "COUNT(entity_id) AS total_item_investigations, " + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "WHERE (source_item_type='oaItem' or source_item_type='repItem') " + "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source "; @@ -224,7 +258,7 @@ public class LaReferenciaStats { + "CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_requests, " + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) 
AS month, source " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem') " + "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source "; @@ -256,7 +290,7 @@ public class LaReferenciaStats { + "COUNT(entity_id) AS total_item_requests, " + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem') " + "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source "; diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java index d20f37363..6e5616e22 100755 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java @@ -101,12 +101,43 @@ public class PiwikStatsDB { logger.info("Inserted data to piwiklogdistinct"); } + public void createDistinctEpisciencesLog() throws Exception { + logger.info("Initialising DB properties"); + ConnectDB.init(); + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping episcienceslogdistinct"); + String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".episcienceslogdistinct"; + stmt.executeUpdate(sql); + logger.info("Dropped episcienceslogdistinct"); + + logger.info("Creating episcienceslogdistinct table"); + // Create Piwiklogdistinct table - This table should exist + String sqlCreateTablePiwikLogDistinct = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".episcienceslogdistinct(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) " + + "into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePiwikLogDistinct); + logger.info("Created episcienceslogdistinct table"); + + logger.info("Inserting data to episcienceslogdistinct"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".episcienceslogdistinct " + + "SELECT DISTINCT * FROM " + ConnectDB.getUsageRawDataDBSchema() + + ".episcienceslog WHERE entity_id is not null"; + stmt.executeUpdate(sql); + logger.info("Inserted data to episcienceslogdistinct"); + } + public void processLogs() throws Exception { try { - logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); - viewsStats(); - logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis())); +//to remove logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); 
+// viewsStats(); +//to remove logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis())); logger.info("DownloadsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); downloadsStats(); @@ -125,6 +156,10 @@ public class PiwikStatsDB { public void processEpisciencesLogs() throws Exception { try { + logger.info("Creating EpisciencesLogDistinct Table"); + createDistinctEpisciencesLog(); + logger.info("EpisciencesLogDistinct Table Created"); + logger.info("Views Episciences processing starts at: " + new Timestamp(System.currentTimeMillis())); episciencesViewsStats(); logger.info("Views Episciences processing ends at: " + new Timestamp(System.currentTimeMillis())); @@ -172,7 +207,7 @@ public class PiwikStatsDB { + ".openaire_views_stats_tmp"; stmt.executeUpdate(drop_views_stats); logger.info("Dropped openaire_views_stats_tmp table"); - +// logger.info("Creating openaire_views_stats_tmp table"); String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp " @@ -187,6 +222,71 @@ stmt.executeUpdate(create_views_stats); logger.info("Created openaire_views_stats_tmp table"); + logger.info("Insert temp missing piwik_ids 630 in openaire_views_stats_tmp table"); + String create_views_stats_missing_id_630 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::cfa5301358b9fcbe7aa45b1ceea088c6' as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=630 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_630); + logger.info("Inserted temp missing piwik_ids 630 in openaire_views_stats_tmp table"); + + logger.info("Insert temp missing piwik_ids 662 in openaire_views_stats_tmp table"); + String create_views_stats_missing_id_662 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::4e86eaf2685a67b743a475f86c7c0086' as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=662 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_662); + logger.info("Inserted temp missing piwik_ids 662 in openaire_views_stats_tmp table"); + + logger.info("Insert temp missing piwik_ids 694 in openaire_views_stats_tmp table"); + String create_views_stats_missing_id_694 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::f35fd567065af297ae65b621e0a21ae9' as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + +
ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=694 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_694); + logger.info("Inserted temp missing piwik_ids 694 in openaire_views_stats_tmp table"); + + logger.info("Insert temp missing piwik_ids 725 in openaire_views_stats_tmp table"); + String create_views_stats_missing_id_725 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::7180cffd6a8e829dacfc2a31b3f72ece' as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=725 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_725); + logger.info("Inserted temp missing piwik_ids 725 in openaire_views_stats_tmp table"); + + logger.info("Insert temp missing piwik_ids 728 in openaire_views_stats_tmp table"); + String create_views_stats_missing_id_728 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::8b3bac12926cc1d9fb5d68783376971d' as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=728 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_728); + logger.info("Inserted temp missing piwik_ids 728 in openaire_views_stats_tmp table"); + logger.info("Creating openaire_pageviews_stats_tmp table"); String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp AS SELECT " @@ -209,27 +309,27 @@ public class PiwikStatsDB { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Dropping openaire_result_downloads_monthly_tmp view"); - String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".openaire_result_downloads_monthly_tmp"; - stmt.executeUpdate(drop_result_downloads_monthly); - logger.info("Dropped openaire_result_downloads_monthly_tmp view"); - - logger.info("Creating openaire_result_downloads_monthly_tmp view"); - String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() - + ".openaire_result_downloads_monthly_tmp " - + "AS SELECT entity_id, " - + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id," - + "COUNT(entity_id) as downloads, " - + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " - + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogdistinct where action='download' " - + "AND (source_item_type='oaItem' OR 
source_item_type='repItem') " - + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " - + "ORDER BY source, entity_id, month"; - stmt.executeUpdate(sql); - logger.info("Created openaire_result_downloads_monthly_tmp view"); +//to remove logger.info("Dropping openaire_result_downloads_monthly_tmp view"); +// String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " +// + ConnectDB.getUsageStatsDBSchema() +// + ".openaire_result_downloads_monthly_tmp"; +// stmt.executeUpdate(drop_result_downloads_monthly); +// logger.info("Dropped openaire_result_downloads_monthly_tmp view"); +// +// logger.info("Creating openaire_result_downloads_monthly_tmp view"); +// String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() +// + ".openaire_result_downloads_monthly_tmp " +// + "AS SELECT entity_id, " +// + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id," +// + "COUNT(entity_id) as downloads, " +// + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " +// + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +// + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogdistinct where action='download' " +// + "AND (source_item_type='oaItem' OR source_item_type='repItem') " +// + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " +// + "ORDER BY source, entity_id, month"; +// stmt.executeUpdate(sql); +//to remove logger.info("Created openaire_result_downloads_monthly_tmp view"); logger.info("Dropping openaire_downloads_stats_tmp table"); String drop_views_stats = "DROP TABLE IF EXISTS " @@ -239,7 +339,8 @@ public class PiwikStatsDB { logger.info("Dropped openaire_downloads_stats_tmp table"); logger.info("Creating openaire_downloads_stats_tmp table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " + String sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp AS " + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + "max(downloads) AS count, max(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " @@ -251,8 +352,74 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("Created downloads_stats table"); + logger.info("Insert temp missing piwik_ids 630 in openaire_downloads_stats_tmp table"); + String create_views_stats_missing_id_630 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::cfa5301358b9fcbe7aa45b1ceea088c6' as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=630 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_630); + logger.info("Inserted temp missing piwik_ids 630 in openaire_downloads_stats_tmp table"); + + logger.info("Insert temp missing piwik_ids 662 in openaire_downloads_stats_tmp table"); + String create_views_stats_missing_id_662 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + 
+ ".openaire_downloads_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::4e86eaf2685a67b743a475f86c7c0086' as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=662 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_662); + logger.info("Inserted temp missing piwik_ids 662 in openaire_downloads_stats_tmp table"); + + logger.info("Insert temp missing piwik_ids 694 in openaire_downloads_stats_tmp table"); + String create_views_stats_missing_id_694 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::f35fd567065af297ae65b621e0a21ae9' as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=694 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_694); + logger.info("Inserted temp missing piwik_ids 694 in openaire_downloads_stats_tmp table"); + + logger.info("Insert temp missing piwik_ids 725 in openaire_downloads_stats_tmp table"); + String create_views_stats_missing_id_725 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::7180cffd6a8e829dacfc2a31b3f72ece' as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=725 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_725); + logger.info("Inserted temp missing piwik_ids 725 in openaire_downloads_stats_tmp table"); + + logger.info("Insert temp missing piwik_ids 728 in openaire_downloads_stats_tmp table"); + String create_views_stats_missing_id_728 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp " + + "SELECT 'OpenAIRE' as source, 'opendoar____::8b3bac12926cc1d9fb5d68783376971d' as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=728 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' " + + "GROUP BY ro.id, month " + + "ORDER BY ro.id, month "; + stmt.executeUpdate(create_views_stats_missing_id_728); + logger.info("Inserted temp missing piwik_ids 728 in openaire_downloads_stats_tmp table"); + logger.info("Dropping openaire_result_downloads_monthly_tmp 
view"); - sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp"; + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_downloads_monthly_tmp"; logger.info("Dropped openaire_result_downloads_monthly_tmp view "); stmt.executeUpdate(sql); @@ -298,6 +465,44 @@ public class PiwikStatsDB { } + public void uploadPangaeaLogs() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping Pangaea pangaea_views_stats_tmp table + logger.info("Dropping pangaea_views_stats_tmp table"); + String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_views_stats_tmp"; + logger.info("Dropped pangaea_views_stats_tmp table "); + stmt.executeUpdate(sql); + + // Dropping Pangaea pangaea_downloads_stats table + logger.info("Dropping pangaea_downloads_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_downloads_stats"; + logger.info("Dropped pangaea_downloads_stats table "); + stmt.executeUpdate(sql); + + // Creating Pangaea pangaea_views_stats_tmp table + logger.info("Creating Pangaea pangaea_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_views_stats_tmp AS " + + "SELECT 'PANGAEA' as source, 're3data_____::9633d1e8c4309c833c2c442abeb0cfeb' as repository_id," + + "r.id as result_id,date, cast(count as BIGINT) as count, 0 as openaire " + + "FROM default.pangaeaviews p, " + ConnectDB.getStatsDBSchema() + + ".result_oids r where r.oid=p.result_id"; + stmt.executeUpdate(sql); + logger.info("Created pangaea_views_stats_tmp table "); + + // Creating Pangaea pangaea_downloads_stats_tmp table + logger.info("Creating Pedocs pangaea_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_downloads_stats_tmp AS " + + "SELECT 'PANGAEA' as source, 're3data_____::9633d1e8c4309c833c2c442abeb0cfeb' as repository_id," + + "r.id as result_id, date, cast(count as BIGINT) as count, 0 as openaire " + + "FROM default.pangaeadownloads p, " + ConnectDB.getStatsDBSchema() + + ".result_oids r where r.oid=p.result_id"; + stmt.executeUpdate(sql); + logger.info("Created pangaea_downloads_stats_tmp table "); + + } + public void uploadTUDELFTStats() throws Exception { stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); @@ -499,7 +704,8 @@ public class PiwikStatsDB { String returnEpisciencesJournals = "SELECT id, substring(regexp_extract(websiteurl,'^([^\\.]+)\\.?',1),9) FROM " + ConnectDB.getStatsDBSchema() + - ".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true)"; + ".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true) " + + "and websiteurl!='https://episciences.org/'"; PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION .prepareStatement(returnEpisciencesJournals); @@ -525,12 +731,11 @@ public class PiwikStatsDB { + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + "AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() - + ".episcienceslog where action='action' and (source_item_type='oaItem' or " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + + ".episcienceslogdistinct where 
action='action' and (source_item_type='oaItem' or " + "source_item_type='repItem') and entity_id like '%" + episciencesSuffix + "%'" + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + "source ORDER BY source, entity_id"; - stmt.executeUpdate(create_result_views_monthly); logger.info("Created episciencesSuffix_result_views_monthly_tmp table"); @@ -547,6 +752,23 @@ stmt.executeUpdate(insertIntoEpisciencesViewsTable); logger.info("Inserted episciencesSuffix_result_views_monthly_tmp into EpisciencesViews Table"); + logger + .info( + "Inserting episciencesSuffix_result_views_monthly_tmp for Episciences into EpisciencesViews Table"); + String insertIntoEpisciencesViewsAllTable = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".episciencesviews SELECT 'Episciences' as source, " + + " 'openaire____::6824b298c96ba906a3e6a70593affbf5' as repository_id, ro.id as result_id, month as date," + + " max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + + "." + episciencesSuffix.replace("-", "_") + "_result_views_monthly_tmp p," + + ConnectDB.getStatsDBSchema() + + ".result_oids ro WHERE p.id=ro.oid GROUP BY ro.id, month ORDER BY ro.id, month"; + + stmt.executeUpdate(insertIntoEpisciencesViewsAllTable); + logger + .info( + "Inserted episciencesSuffix_result_views_monthly_tmp for Episciences into EpisciencesViews Table"); + stmt.executeUpdate(dropepisciencesSuffixView); logger.info("Dropped episciencesSuffix_result_views_monthly_tmp view"); } @@ -576,7 +798,8 @@ public class PiwikStatsDB { String returnEpisciencesJournals = "SELECT id, substring(regexp_extract(websiteurl,'^([^\.]+)\.?',1),9) FROM " + ConnectDB.getStatsDBSchema() + - ".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true)"; + ".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true) " + + "and websiteurl!='https://episciences.org/'"; PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION .prepareStatement(returnEpisciencesJournals); @@ -600,8 +823,8 @@ public class PiwikStatsDB { + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + "AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() - + ".episcienceslog where action='download' and (source_item_type='oaItem' or " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + + ".episcienceslogdistinct where action='download' and (source_item_type='oaItem' or " + "source_item_type='repItem') and entity_id like '%" + episciencesSuffix + "%'" + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + "source ORDER BY source, entity_id"; @@ -622,6 +845,23 @@ stmt.executeUpdate(insertIntoEpisciencesDownloadsTable); logger.info("Inserted episciencesSuffix_result_downloads_monthly_tmp into EpisciencesDownloadsTable"); + logger + .info( + "Inserting episciencesSuffix_result_downloads_monthly_tmp for Episciences into EpisciencesDownloadsTable"); + String insertIntoEpisciencesDownloadsAllTable = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".episciencesdownloads SELECT 'Episciences' as source, " + + " 'openaire____::6824b298c96ba906a3e6a70593affbf5' as repository_id, ro.id as result_id, month as date," + + " max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " +
ConnectDB.getUsageStatsDBSchema() + + "." + episciencesSuffix.replace("-", "_") + "_result_downloads_monthly_tmp p," + + ConnectDB.getStatsDBSchema() + + ".result_oids ro WHERE p.id=ro.oid GROUP BY ro.id, month ORDER BY ro.id, month"; + + stmt.executeUpdate(insertIntoEpisciencesDownloadsAllTable); + logger + .info( + "Inserted episciencesSuffix_result_downloads_monthly_tmp for Episciences into EpisciencesDownloadsTable"); + stmt.executeUpdate(dropepisciencesSuffixDownloads); logger.info("Dropped episciencesSuffix_result_downloads_monthly_tmp view"); @@ -767,9 +1007,15 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("Dropped Table tbl_all_r5_metrics"); - logger.info("Create Table tbl_all_r5_metrics"); + // All CoP R5 metrics Table + logger.info("Drop Table counter_r5_stats_with_metrics"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics "; + stmt.executeUpdate(sql); + logger.info("Dropped Table counter_r5_stats_with_metrics"); + + logger.info("Create Table counter_r5_stats_with_metrics"); sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".tbl_all_r5_metrics as " + + ".counter_r5_stats_with_metrics as " + "WITH tmp1 as (SELECT coalesce(ds.repository_id, vs.repository_id) as repository_id, " + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + "coalesce(vs.unique_item_investigations, 0) as unique_item_investigations, " @@ -793,7 +1039,7 @@ public class PiwikStatsDB { + "FROM tmp2 AS ds FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".tbl_total_item_requests " + "AS vs ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; stmt.executeUpdate(sql); - logger.info("Created Table tbl_all_r5_metrics"); + logger.info("Created Table counter_r5_stats_with_metrics"); stmt.close(); ConnectDB.getHiveConnection().close(); @@ -857,6 +1103,13 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("Pedocs views updated to views_stats"); + // Inserting Pangaea views stats + logger.info("Inserting Pangaea old data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Pangaea views updated to views_stats"); + // Inserting TUDELFT views stats logger.info("Inserting TUDELFT data to views_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " @@ -878,6 +1131,12 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("B2SHARE views updated to views_stats"); + // Inserting Datacite views stats + logger.info("Inserting Datacite views to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Creating downloads_stats table"); String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() @@ -893,7 +1152,7 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("Inserted OpenAIRE data to downloads_stats"); - // Inserting Episciences views stats + // Inserting Episciences downloads stats logger.info("Inserting Episciences data to downloads_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".episciencesdownloads"; @@ -907,6 
+1166,13 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("Inserted Pedocs data to downloads_stats"); + // Inserting Pangaea downloads stats + logger.info("Inserting Pangaea old data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Pangaea downloads updated to downloads_stats"); + // Inserting TUDELFT downloads stats logger.info("Inserting TUDELFT data to downloads_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " @@ -920,6 +1186,7 @@ public class PiwikStatsDB { + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".b2share_downloads_stats_tmp"; stmt.executeUpdate(sql); logger.info("Inserted B2SHARE data to downloads_stats"); + // Inserting Lareferencia downloads stats logger.info("Inserting LaReferencia data to downloads_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " @@ -934,14 +1201,13 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("IRUS downloads updated to downloads_stats"); - // Inserting IRUS_R5 downloads stats + // Inserting IRUS_R5 views stats logger.info("Inserting IRUS_R5 views to views_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + "SELECT source, repository_id, result_id, `date`, views, openaire FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_R5_stats_tmp"; stmt.executeUpdate(sql); - logger.info("IRUS_R5 views updated to views_stats"); // Inserting IRUS_R5 downloads stats logger.info("Inserting IRUS_R5 data to downloads_stats"); @@ -950,7 +1216,6 @@ public class PiwikStatsDB { + ConnectDB.getUsageStatsDBSchema() + ".irus_R5_stats_tmp"; stmt.executeUpdate(sql); - logger.info("IRUS_R5 downloads updated to downloads_stats"); // Inserting SARC-OJS downloads stats logger.info("Inserting SARC data to downloads_stats"); @@ -959,19 +1224,11 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("SARC-OJS downloads updated to downloads_stats"); - // Inserting Datacite views stats - logger.info("Inserting Datacite views to views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " - + "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_views"; - stmt.executeUpdate(sql); - logger.info("Datacite views updated to views_stats"); - // Inserting Datacite downloads stats logger.info("Inserting Datacite downloads to downloads_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_downloads"; stmt.executeUpdate(sql); - logger.info("Datacite downloads updated to downloads_stats"); logger.info("Creating pageviews_stats table"); String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() @@ -986,28 +1243,6 @@ public class PiwikStatsDB { + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp"; stmt.executeUpdate(sql); - logger.info("Dropping full_dates table"); - String dropFullDates = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".full_dates"; - stmt.executeUpdate(dropFullDates); - logger.info("Dropped full_dates table"); - - Calendar startCalendar = Calendar.getInstance(); - startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01")); - Calendar endCalendar = Calendar.getInstance(); 
- int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR); - int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH); - - logger.info("Creating full_dates table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " - + "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " - + "FROM (SELECT DATE '2016-01-01' AS from_date) p " - + "LATERAL VIEW " - + "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x"; - stmt.executeUpdate(sql); - logger.info("Created full_dates table"); - logger.info("Inserting data to usage_stats"); sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " + "SELECT coalesce(ds.source, vs.source) as source, " @@ -1022,15 +1257,77 @@ stmt.executeUpdate(sql); logger.info("Inserted data to usage_stats"); + // Dropping project_stats table + logger.info("Dropping project_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".project_stats"; + logger.info("Dropped project_stats table "); + stmt.executeUpdate(sql); + + // Dropping datasource_stats table + logger.info("Dropping datasource_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".datasource_stats"; + logger.info("Dropped datasource_stats table "); + stmt.executeUpdate(sql); + + logger.info("Inserting data to project_downloads"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".project_downloads as " + + " select pr.id, sum(count) downloads, sum(openaire) openaire_downloads,`date` " + + " from " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + " join " + ConnectDB.getStatsDBSchema() + ".project_results pr on result_id=pr.result " + + " join " + ConnectDB.getStatsDBSchema() + ".project p on p.id=pr.id " + + " group by pr.id,`date`"; + stmt.executeUpdate(sql); + logger.info("Inserted data to project_downloads"); + + logger.info("Inserting data to project_views"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".project_views as " + + " select pr.id, sum(count) views, sum(openaire) openaire_views,`date` " + + " from " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + " join " + ConnectDB.getStatsDBSchema() + ".project_results pr on result_id=pr.result " + + " join " + ConnectDB.getStatsDBSchema() + ".project p on p.id=pr.id " + + " group by pr.id,`date`"; + stmt.executeUpdate(sql); + logger.info("Inserted data to project_views"); + + logger.info("Inserting data to project_stats"); + sql = " CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".project_stats as " + + " SELECT coalesce(pv.id, pd.id) as id, coalesce(pd.`date`, pv.`date`) as `date`, " + + " coalesce(pv.views, 0) as views, coalesce(pd.downloads, 0) as downloads, " + + " coalesce(pv.openaire_views,0) as openaire_views,coalesce(pd.openaire_downloads, 0) as openaire_downloads " + + " FROM " + ConnectDB.getUsageStatsDBSchema() + ".project_downloads pd " + + " FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".project_views pv " + + " ON pd.id=pv.id AND pd.`date`=pv.`date`"; + stmt.executeUpdate(sql); + + logger.info("Inserted data to project_stats"); + + logger.info("Inserting data to datasource_stats"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".datasource_stats AS " + + " with datasource_views as " + + " (select repository_id, sum(views) views,
sum(openaire_views) openaire_views,`date` " + + " from " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats group by repository_id,`date`), " + + " datasource_downloads as " + + " (select repository_id, sum(downloads) downloads,sum(openaire_downloads) openaire_downloads,`date` " + + " from " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats group by repository_id,`date`)" + + " SELECT coalesce(dv.repository_id, dd.repository_id) as repository_id, coalesce(dd.`date`, dv.`date`) as `date`," + + " coalesce(dv.views, 0) as views, coalesce(dd.downloads, 0) as downloads, " + + " coalesce(dv.openaire_views, 0) as openaire_views,coalesce(dd.openaire_downloads, 0) as openaire_downloads " + + " FROM datasource_downloads dd " + + " FULL OUTER JOIN " + + " datasource_views dv ON dd.repository_id=dv.repository_id AND dd.`date`=dv.`date`"; + + stmt.executeUpdate(sql); + logger.info("Inserted data to datasource_stats"); + // Inserting LaReferencia CoP R5 Metrics - logger.info("Inserting Lareferencia data to tbl_all_r5_metrics"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".tbl_all_r5_metrics " + logger.info("Inserting Lareferencia data to counter_r5_stats_with_metrics"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics " + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_all_r5_metrics"; stmt.executeUpdate(sql); // Inserting IRUS-UK CoP R5 Metrics - logger.info("Inserting IRUS-UK data into tbl_all_r5_metrics"); - String insertΡ5Stats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".tbl_all_r5_metrics " + logger.info("Inserting IRUS-UK data into counter_r5_stats_with_metrics"); + String insertΡ5Stats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics " + "SELECT s.source, d.id AS repository_id, " + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, " + "s.unique_item_investigations , s.total_item_investigations, " @@ -1040,7 +1337,7 @@ + ConnectDB.getStatsDBSchema() + ".result_oids ro " + "WHERE s.repository=d.oid AND s.rid=ro.oid AND s.source='IRUS-UK'"; stmt.executeUpdate(insertΡ5Stats); - logger.info("Inserted IRUS-UK data into tbl_all_r5_metrics"); + logger.info("Inserted IRUS-UK data into counter_r5_stats_with_metrics"); logger.info("Building views at permanent DB starts at: " + new Timestamp(System.currentTimeMillis())); @@ -1088,6 +1385,28 @@ stmt.executeUpdate(sql); logger.info("Created view on usage_stats on permanent usagestats DB"); + logger.info("Dropping view project_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".project_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view on project_stats on permanent usagestats DB"); + + logger.info("Create view on project_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".project_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".project_stats"; + stmt.executeUpdate(sql); + logger.info("Created view on project_stats on permanent usagestats DB"); + + logger.info("Dropping view datasource_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".datasource_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view on datasource_stats on permanent usagestats DB"); + + logger.info("Create view on datasource_stats on permanent
usagestats DB"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsagestatsPermanentDBSchema() + ".datasource_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".datasource_stats"; + stmt.executeUpdate(sql); + logger.info("Created view on project_stats on permanent usagestats DB"); + logger.info("Dropping view COUNTER_R5_Metrics on permanent usagestats DB"); sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".counter_r5_stats_with_metrics"; stmt.executeUpdate(sql); @@ -1096,7 +1415,7 @@ public class PiwikStatsDB { logger.info("Create view on COUNTER_R5_Metrics on permanent usagestats DB"); sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".counter_r5_stats_with_metrics" - + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tbl_all_r5_metrics"; + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics"; stmt.executeUpdate(sql); logger.info("Created view on COUNTER_R5_Metrics on permanent usagestats DB"); diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java index 886ebca23..1ba62b03d 100755 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java @@ -29,21 +29,16 @@ public class UsageStatsExporter { logger.info("Initialising DB properties"); ConnectDB.init(); -// runImpalaQuery(); PiwikStatsDB piwikstatsdb = new PiwikStatsDB(); - logger.info("Re-creating database and tables"); - if (ExecuteWorkflow.recreateDbAndTables) { - piwikstatsdb.recreateDBAndTables(); - logger.info("DB-Tables are created "); - } -// else { -// piwikstatsdb.createTmpTables(); -// logger.info("TmpTables are created "); -// } +//to remove logger.info("Re-creating database and tables"); +// if (ExecuteWorkflow.recreateDbAndTables) { +// piwikstatsdb.recreateDBAndTables(); +// logger.info("DB-Tables are created "); +//to remove } if (ExecuteWorkflow.processPiwikLogs) { - logger.info("Creating distinct piwik log"); - piwikstatsdb.createDistinctPiwikLog(); - logger.info("Processing OpenAIRE logs"); +//to remove logger.info("Creating distinct piwik log"); +// piwikstatsdb.createDistinctPiwikLog(); +//to remove logger.info("Processing OpenAIRE logs"); piwikstatsdb.processLogs(); logger.info("OpenAIRE logs Done"); logger.info("Processing Episciences logs"); @@ -52,6 +47,9 @@ public class UsageStatsExporter { logger.info("Processing Pedocs Old Stats"); piwikstatsdb.uploadOldPedocs(); logger.info("Processing Pedocs Old Stats Done"); + logger.info("Processing Pangaea Stats"); + piwikstatsdb.uploadPangaeaLogs(); + logger.info("Processing Pangaea Stats Done"); logger.info("Processing TUDELFT Stats"); piwikstatsdb.uploadTUDELFTStats(); logger.info("Processing TUDELFT Stats Done"); @@ -116,6 +114,18 @@ public class UsageStatsExporter { sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".project_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".datasource_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + 
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics"; + stmt.executeUpdate(sql); + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); @@ -125,6 +135,12 @@ sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".project_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".datasource_stats"; + stmt.executeUpdate(sql); + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); diff --git a/dhp-workflows/dhp-usage-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatspromote/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-usage-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatspromote/oozie_app/updateProductionViews.sh index 3e510e87e..2d37ff498 100755 --- a/dhp-workflows/dhp-usage-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatspromote/oozie_app/updateProductionViews.sh +++ b/dhp-workflows/dhp-usage-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatspromote/oozie_app/updateProductionViews.sh @@ -9,8 +9,33 @@ fi export SOURCE=$1 export PRODUCTION=$2 +#echo "Updating ${PRODUCTION} database" +#impala-shell -q "create database if not exists ${PRODUCTION}" +#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - +#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +#echo "Production db ready!" + echo "Updating ${PRODUCTION} database" impala-shell -q "create database if not exists ${PRODUCTION}" -impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - -echo "Production db ready!"
\ No newline at end of file +impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.views_stats;" +impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.views_stats as SELECT * from ${SOURCE}.views_stats;" +impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.views_stats;" +impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.pageviews_stats;" +impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.pageviews_stats as SELECT * from ${SOURCE}.pageviews_stats;" +impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.pageviews_stats;" +impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.downloads_stats;" +impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.downloads_stats as SELECT * from ${SOURCE}.downloads_stats;" +impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.downloads_stats;" +impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.usage_stats;" +impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.usage_stats as SELECT * from ${SOURCE}.usage_stats;" +impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.usage_stats;" +impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.project_stats;" +impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.project_stats as SELECT * from ${SOURCE}.project_stats;" +impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.project_stats;" +impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.datasource_stats;" +impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.datasource_stats as SELECT * from ${SOURCE}.datasource_stats;" +impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.datasource_stats;" +impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.counter_r5_stats_with_metrics;" +impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.counter_r5_stats_with_metrics as SELECT * from ${SOURCE}.counter_r5_stats_with_metrics;" +impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.counter_r5_stats_with_metrics;" +echo "Production db ready!"
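Note on the promotion step above: replacing the generated show-tables loop with explicit per-view statements makes the promoted set deterministic, at the cost of keeping this list in sync with the tables built upstream. The same drop/create/invalidate cycle can be expressed over JDBC; the following is a minimal sketch only, where the class name, connection URL and main() wiring are illustrative assumptions and not part of this change.

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.Statement;

    // Sketch: mirrors updateProductionViews.sh against an assumed Impala
    // JDBC endpoint ("impalad-host" is a placeholder).
    public class PromoteProductionViews {
        public static void main(String[] args) throws Exception {
            String source = args[0]; // freshly built usage stats DB
            String production = args[1]; // public production DB
            String[] views = { "views_stats", "pageviews_stats", "downloads_stats", "usage_stats",
                "project_stats", "datasource_stats", "counter_r5_stats_with_metrics" };
            try (Connection con = DriverManager.getConnection("jdbc:hive2://impalad-host:21050/;auth=noSasl");
                Statement stmt = con.createStatement()) {
                stmt.execute("CREATE DATABASE IF NOT EXISTS " + production);
                for (String v : views) {
                    // drop the stale view, point the new one at the source DB,
                    // then refresh Impala's metadata cache for it
                    stmt.execute("DROP VIEW IF EXISTS " + production + "." + v);
                    stmt.execute("CREATE VIEW " + production + "." + v + " AS SELECT * FROM " + source + "." + v);
                    stmt.execute("INVALIDATE METADATA " + production + "." + v);
                }
            }
        }
    }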
diff --git a/dhp-workflows/dhp-usage-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatspromote/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatspromote/oozie_app/workflow.xml
index 93bad4000..3d6c1d162 100755
--- a/dhp-workflows/dhp-usage-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatspromote/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-usage-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatspromote/oozie_app/workflow.xml
@@ -1,11 +1,11 @@
     <parameters>
         <property>
-            <name>usage_stats_db_name</name>
+            <name>usage_stats_db</name>
             <description>the target usage stats database name</description>
         </property>
         <property>
-            <name>usage_stats_db_production_name</name>
+            <name>usage_stats_db_production</name>
             <description>the name of the public production usage stats database</description>
         </property>
@@ -48,8 +48,8 @@
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <exec>updateProductionViews.sh</exec>
-            <argument>${usage_stats_db_name}</argument>
-            <argument>${usage_stats_db_production_name}</argument>
+            <argument>${usage_stats_db}</argument>
+            <argument>${usage_stats_db_production}</argument>
             <file>updateProductionViews.sh</file>
diff --git a/dhp-workflows/dhp-usage-stats-update/pom.xml b/dhp-workflows/dhp-usage-stats-update/pom.xml
deleted file mode 100755
index 20d2f5b76..000000000
--- a/dhp-workflows/dhp-usage-stats-update/pom.xml
+++ /dev/null
@@ -1,91 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <parent>
-        <artifactId>dhp-workflows</artifactId>
-        <groupId>eu.dnetlib.dhp</groupId>
-        <version>1.2.4-SNAPSHOT</version>
-    </parent>
-    <modelVersion>4.0.0</modelVersion>
-    <artifactId>dhp-usage-stats-build</artifactId>
-    <build>
-        <plugins>
-            <plugin>
-                <groupId>pl.project13.maven</groupId>
-                <artifactId>git-commit-id-plugin</artifactId>
-                <version>2.1.15</version>
-                <executions>
-                    <execution>
-                        <goals>
-                            <goal>revision</goal>
-                        </goals>
-                    </execution>
-                </executions>
-                <configuration>
-                    <dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
-                </configuration>
-            </plugin>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-compiler-plugin</artifactId>
-                <version>3.6.1</version>
-                <configuration>
-                    <source>1.8</source>
-                    <target>1.8</target>
-                </configuration>
-            </plugin>
-        </plugins>
-    </build>
-    <properties>
-        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-        <cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version>
-        <cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version>
-    </properties>
-    <dependencies>
-        <dependency>
-            <groupId>org.apache.spark</groupId>
-            <artifactId>spark-core_2.11</artifactId>
-            <version>2.2.0</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.spark</groupId>
-            <artifactId>spark-sql_2.11</artifactId>
-            <version>2.4.5</version>
-        </dependency>
-        <dependency>
-            <groupId>com.googlecode.json-simple</groupId>
-            <artifactId>json-simple</artifactId>
-            <version>1.1.1</version>
-        </dependency>
-        <dependency>
-            <groupId>org.json</groupId>
-            <artifactId>json</artifactId>
-            <version>20180130</version>
-            <type>jar</type>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.hive</groupId>
-            <artifactId>hive-jdbc</artifactId>
-            <version>${cdh.hive.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-common</artifactId>
-            <version>${cdh.hadoop.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-common</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>c3p0</groupId>
-            <artifactId>c3p0</artifactId>
-            <version>0.9.1.2</version>
-            <type>jar</type>
-        </dependency>
-    </dependencies>
-    <name>dhp-usage-stats-build</name>
-</project>
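For orientation, the recurring pattern behind the new piwiklogdistinct, lareferencialogdistinct and episcienceslogdistinct tables in this change is: drop the table, recreate it as a bucketed transactional ORC table, then INSERT ... SELECT DISTINCT from the raw log while discarding rows without an entity_id. A condensed sketch of that step follows; the generic helper and its parameter names are illustrative, the authoritative DDL lives in PiwikStatsDB and LaReferenciaStats above.

    import java.sql.Statement;

    public final class DistinctLogHelper {
        // Rebuilds <table>distinct in the usage-stats schema from the raw log table,
        // keeping one copy of each row and dropping rows without an entity_id.
        public static void createDistinctLog(Statement stmt, String usageStatsSchema,
            String rawSchema, String table) throws Exception {
            stmt.executeUpdate("DROP TABLE IF EXISTS " + usageStatsSchema + "." + table + "distinct");
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS " + usageStatsSchema + "." + table + "distinct "
                + "(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
                + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
                + "CLUSTERED BY (source, id_visit, action, timestamp, entity_id) "
                + "INTO 100 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
            stmt.executeUpdate("INSERT INTO " + usageStatsSchema + "." + table + "distinct "
                + "SELECT DISTINCT * FROM " + rawSchema + "." + table + " WHERE entity_id IS NOT NULL");
        }
    }

With such a helper, createDistinctEpisciencesLog() reduces to createDistinctLog(stmt, ConnectDB.getUsageStatsDBSchema(), ConnectDB.getUsageRawDataDBSchema(), "episcienceslog"); the LaReferencia variant differs only in its extra leading matomoid INT column.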