dimitrispie 2024-01-17 18:02:33 +02:00
parent 6b247524a8
commit 22eaf211e8
10 changed files with 510 additions and 201 deletions

View File

@@ -397,8 +397,8 @@ public class ReadReportsListFromDatacite {
+ ".datacite_downloads STORED AS PARQUET as "
+ "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire "
+ "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance "
+ "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform "
+ "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid "
+ "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on lower(name)=lower(platform) "
+ "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on lower(string(ds_type))=lower(od.oid) "
+ "where metric_type='total-dataset-requests' ";
stmt.executeUpdate(createDownloadsTable);
logger.info("Downloads Stats table created");
@@ -408,8 +408,8 @@ public class ReadReportsListFromDatacite {
+ ".datacite_views STORED AS PARQUET as "
+ "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire "
+ "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance "
+ "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform "
+ "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid "
+ "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on lower(name)=lower(platform) "
+ "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on lower(string(ds_type))=lower(od.oid) "
+ "where metric_type='total-dataset-investigations' ";
stmt.executeUpdate(createViewsTable);
logger.info("Views Stats table created");

View File

@@ -158,7 +158,7 @@ public class LaReferenciaDownloadLogs {
// end.add(Calendar.MONTH, +1);
// end.add(Calendar.DAY_OF_MONTH, -1);
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
end.add(Calendar.DAY_OF_MONTH, -3);
logger.info("Ending period for log download: " + sdf.format(end.getTime()));
@@ -205,7 +205,7 @@ public class LaReferenciaDownloadLogs {
true);
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format
+ "&expanded=5&filter_limit=500&token_auth=" + tokenAuth;
+ "&expanded=5&filter_limit=100&token_auth=" + tokenAuth;
String content = "";
int i = 0;

View File

@@ -228,6 +228,12 @@ public class PiwikDownloadLogs {
while (rs.next()) {
piwikIdToVisit.add(rs.getInt(1));
}
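// Manually append Matomo site IDs that the datasource query above does not return;
// the same five "missing" piwik ids (630, 662, 694, 725, 728) are back-filled with
// fixed repository ids later in PiwikStatsDB.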
piwikIdToVisit.add(630);
piwikIdToVisit.add(662);
piwikIdToVisit.add(694);
piwikIdToVisit.add(725);
piwikIdToVisit.add(728);
logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0

View File

@@ -51,7 +51,6 @@ public class UsageStatsExporter {
ConnectDB.init();
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
logger.info("Re-creating database and tables");
if (ExecuteWorkflow.recreateDbAndTables) {
piwikstatsdb.recreateDBAndTables();

View File

@@ -41,8 +41,42 @@ public class LaReferenciaStats {
public LaReferenciaStats() throws Exception {
}
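// New helper: copies the raw lareferencialog into a deduplicated (SELECT DISTINCT),
// bucketed, transactional ORC table that the views/downloads queries below read from.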
public void createDistinctLaReferenciaLog() throws Exception {
logger.info("Initialising DB properties");
ConnectDB.init();
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping lareferencialogdistinct");
String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct";
stmt.executeUpdate(sql);
logger.info("Dropped lareferencialogdistinct");
logger.info("Creating lareferencialogdistinct table");
// Create lareferencialogdistinct table - This table should exist
String sqlCreateTablePiwikLogDistinct = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".lareferencialogdistinct(matomoid INT, source STRING, id_visit STRING, country STRING, action STRING, url STRING, "
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, action, timestamp, entity_id) "
+ "into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTablePiwikLogDistinct);
logger.info("Created lareferencialogdistinct table");
logger.info("Inserting data to lareferencialogdistinct");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct "
+ "SELECT DISTINCT * FROM " + ConnectDB.getUsageRawDataDBSchema()
+ ".lareferencialog WHERE entity_id is not null";
stmt.executeUpdate(sql);
logger.info("Inserted data to lareferencialogdistinct");
}
public void processLogs() throws Exception {
try {
logger.info("Creating LareferenciaLogDistinct");
createDistinctLaReferenciaLog();
logger.info("LaReferencia creating viewsStats");
viewsStats();
logger.info("LaReferencia created viewsStats");
@@ -76,7 +110,7 @@ public class LaReferenciaStats {
"SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " +
"THEN 1 ELSE 0 END) AS openaire_referrer, " +
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
"FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='action' and " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct where action='action' and " +
"(source_item_type='oaItem' or source_item_type='repItem') " +
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
"source ORDER BY source, entity_id";
@@ -117,7 +151,7 @@ public class LaReferenciaStats {
"SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " +
"THEN 1 ELSE 0 END) AS openaire_referrer, " +
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
"FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='download' and " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct where action='download' and " +
"(source_item_type='oaItem' or source_item_type='repItem') " +
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
"source ORDER BY source, entity_id";
@@ -160,7 +194,7 @@ public class LaReferenciaStats {
+ "CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_investigations, "
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct "
+ "WHERE (source_item_type='oaItem' or source_item_type='repItem') "
+ "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ";
@@ -192,7 +226,7 @@ public class LaReferenciaStats {
+ "COUNT(entity_id) AS total_item_investigations, "
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct "
+ "WHERE (source_item_type='oaItem' or source_item_type='repItem') "
+ "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ";
@@ -224,7 +258,7 @@ public class LaReferenciaStats {
+ "CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_requests, "
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct "
+ "WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem') "
+ "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ";
@@ -256,7 +290,7 @@ public class LaReferenciaStats {
+ "COUNT(entity_id) AS total_item_requests, "
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct "
+ "WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem') "
+ "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ";

View File

@@ -101,12 +101,43 @@ public class PiwikStatsDB {
logger.info("Inserted data to piwiklogdistinct");
}
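// New helper, mirroring createDistinctPiwikLog() above: deduplicates the raw
// episcienceslog into episcienceslogdistinct for the Episciences queries below.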
public void createDistinctEpisciencesLog() throws Exception {
logger.info("Initialising DB properties");
ConnectDB.init();
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping episcienceslogdistinct");
String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".episcienceslogdistinct";
stmt.executeUpdate(sql);
logger.info("Dropped episcienceslogdistinct");
logger.info("Creating episcienceslogdistinct table");
// Create episcienceslogdistinct table - This table should exist
String sqlCreateTablePiwikLogDistinct = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".episcienceslogdistinct(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, action, timestamp, entity_id) "
+ "into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTablePiwikLogDistinct);
logger.info("Created episcienceslogdistinct table");
logger.info("Inserting data to episcienceslogdistinct");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".episcienceslogdistinct "
+ "SELECT DISTINCT * FROM " + ConnectDB.getUsageRawDataDBSchema()
+ ".episcienceslog WHERE entity_id is not null";
stmt.executeUpdate(sql);
logger.info("Inserted data to episcienceslogdistinct");
}
public void processLogs() throws Exception {
try {
logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis()));
viewsStats();
logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis()));
//to remove logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis()));
// viewsStats();
//to remove logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis()));
logger.info("DownloadsStats processing starts at: " + new Timestamp(System.currentTimeMillis()));
downloadsStats();
@@ -125,6 +156,10 @@ public class PiwikStatsDB {
public void processEpisciencesLogs() throws Exception {
try {
logger.info("Creating EpisciencesLogDistinct Table");
createDistinctEpisciencesLog();
logger.info("Creating EpisciencesLogDistinct Table Created");
logger.info("Views Episciences processing starts at: " + new Timestamp(System.currentTimeMillis()));
episciencesViewsStats();
logger.info("Views Episciences processing ends at: " + new Timestamp(System.currentTimeMillis()));
@@ -172,7 +207,7 @@ public class PiwikStatsDB {
+ ".openaire_views_stats_tmp";
stmt.executeUpdate(drop_views_stats);
logger.info("Dropped openaire_views_stats_tmp table");
//
logger.info("Creating openaire_views_stats_tmp table");
String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp "
@@ -187,6 +222,71 @@ public class PiwikStatsDB {
stmt.executeUpdate(create_views_stats);
logger.info("Created openaire_views_stats_tmp table");
logger.info("Insert temp missing piwik_ids 630 in openaire_views_stats_tmp table");
String create_views_stats_missing_id_630 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::cfa5301358b9fcbe7aa45b1ceea088c6' as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=630 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_630);
logger.info("Inserted temp missing piwik_ids 630 in openaire_views_stats_tmp table");
logger.info("Insert temp missing piwik_ids 662 in openaire_views_stats_tmp table");
String create_views_stats_missing_id_662 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::4e86eaf2685a67b743a475f86c7c0086' as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=662 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_662);
logger.info("Inserted temp missing piwik_ids 662 in openaire_views_stats_tmp table");
logger.info("Insert temp missing piwik_ids 694 in openaire_views_stats_tmp table");
String create_views_stats_missing_id_694 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::f35fd567065af297ae65b621e0a21ae9' as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=694 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_694);
logger.info("Inserted temp missing piwik_ids 694 in openaire_views_stats_tmp table");
logger.info("Insert temp missing piwik_ids 725 in openaire_views_stats_tmp table");
String create_views_stats_missing_id_725 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::7180cffd6a8e829dacfc2a31b3f72ece' as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=725 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_725);
logger.info("Inserted temp missing piwik_ids 725 in openaire_views_stats_tmp table");
logger.info("Insert temp missing piwik_ids 728 in openaire_views_stats_tmp table");
String create_views_stats_missing_id_728 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::8b3bac12926cc1d9fb5d68783376971d' as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=728 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_728);
logger.info("Inserted temp missing piwik_ids 728 in openaire_views_stats_tmp table");
logger.info("Creating openaire_pageviews_stats_tmp table");
String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_pageviews_stats_tmp AS SELECT "
@@ -209,27 +309,27 @@ public class PiwikStatsDB {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping openaire_result_downloads_monthly_tmp view");
String drop_result_downloads_monthly = "DROP VIEW IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".openaire_result_downloads_monthly_tmp";
stmt.executeUpdate(drop_result_downloads_monthly);
logger.info("Dropped openaire_result_downloads_monthly_tmp view");
logger.info("Creating openaire_result_downloads_monthly_tmp view");
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_result_downloads_monthly_tmp "
+ "AS SELECT entity_id, "
+ "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
+ "COUNT(entity_id) as downloads, "
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogdistinct where action='download' "
+ "AND (source_item_type='oaItem' OR source_item_type='repItem') "
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source "
+ "ORDER BY source, entity_id, month";
stmt.executeUpdate(sql);
logger.info("Created openaire_result_downloads_monthly_tmp view");
//to remove logger.info("Dropping openaire_result_downloads_monthly_tmp view");
// String drop_result_downloads_monthly = "DROP VIEW IF EXISTS "
// + ConnectDB.getUsageStatsDBSchema()
// + ".openaire_result_downloads_monthly_tmp";
// stmt.executeUpdate(drop_result_downloads_monthly);
// logger.info("Dropped openaire_result_downloads_monthly_tmp view");
//
// logger.info("Creating openaire_result_downloads_monthly_tmp view");
// String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
// + ".openaire_result_downloads_monthly_tmp "
// + "AS SELECT entity_id, "
// + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
// + "COUNT(entity_id) as downloads, "
// + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
// + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
// + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogdistinct where action='download' "
// + "AND (source_item_type='oaItem' OR source_item_type='repItem') "
// + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source "
// + "ORDER BY source, entity_id, month";
// stmt.executeUpdate(sql);
//to remove logger.info("Created openaire_result_downloads_monthly_tmp view");
logger.info("Dropping openaire_downloads_stats_tmp table");
String drop_views_stats = "DROP TABLE IF EXISTS "
@@ -239,7 +339,8 @@ public class PiwikStatsDB {
logger.info("Dropped openaire_downloads_stats_tmp table");
logger.info("Creating openaire_downloads_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS "
String sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_downloads_stats_tmp AS "
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ "max(downloads) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
@@ -251,8 +352,74 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("Created downloads_stats table");
logger.info("Insert temp missing piwik_ids 630 in openaire_downloads_stats_tmp table");
String create_views_stats_missing_id_630 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_downloads_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::cfa5301358b9fcbe7aa45b1ceea088c6' as repository_id, ro.id as result_id, month as date, "
+ "max(downloads) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=630 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_630);
logger.info("Inserted temp missing piwik_ids 630 in openaire_downloads_stats_tmp table");
logger.info("Insert temp missing piwik_ids 662 in openaire_downloads_stats_tmp table");
String create_views_stats_missing_id_662 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_downloads_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::4e86eaf2685a67b743a475f86c7c0086' as repository_id, ro.id as result_id, month as date, "
+ "max(downloads) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=662 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_662);
logger.info("Inserted temp missing piwik_ids 662 in openaire_downloads_stats_tmp table");
logger.info("Insert temp missing piwik_ids 694 in openaire_downloads_stats_tmp table");
String create_views_stats_missing_id_694 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_downloads_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::f35fd567065af297ae65b621e0a21ae9' as repository_id, ro.id as result_id, month as date, "
+ "max(downloads) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=694 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_694);
logger.info("Inserted temp missing piwik_ids 694 in openaire_downloads_stats_tmp table");
logger.info("Insert temp missing piwik_ids 725 in openaire_downloads_stats_tmp table");
String create_views_stats_missing_id_725 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_downloads_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::7180cffd6a8e829dacfc2a31b3f72ece' as repository_id, ro.id as result_id, month as date, "
+ "max(downloads) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=725 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_725);
logger.info("Inserted temp missing piwik_ids 725 in openaire_downloads_stats_tmp table");
logger.info("Insert temp missing piwik_ids 728 in openaire_downloads_stats_tmp table");
String create_views_stats_missing_id_728 = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_downloads_stats_tmp "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::8b3bac12926cc1d9fb5d68783376971d' as repository_id, ro.id as result_id, month as date, "
+ "max(downloads) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=728 AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503' "
+ "GROUP BY ro.id, month "
+ "ORDER BY ro.id, month ";
stmt.executeUpdate(create_views_stats_missing_id_728);
logger.info("Inserted temp missing piwik_ids 728 in openaire_downloads_stats_tmp table");
logger.info("Dropping openaire_result_downloads_monthly_tmp view");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp";
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_result_downloads_monthly_tmp";
logger.info("Dropped openaire_result_downloads_monthly_tmp view ");
stmt.executeUpdate(sql);
@@ -298,6 +465,44 @@ public class PiwikStatsDB {
}
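// New upload step: maps the pre-aggregated PANGAEA counts in default.pangaeaviews
// and default.pangaeadownloads onto OpenAIRE result ids via result_oids.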
public void uploadPangaeaLogs() throws Exception {
stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
// Dropping Pangaea pangaea_views_stats_tmp table
logger.info("Dropping pangaea_views_stats_tmp table");
String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_views_stats_tmp";
logger.info("Dropped pangaea_views_stats_tmp table ");
stmt.executeUpdate(sql);
// Dropping Pangaea pangaea_downloads_stats table
logger.info("Dropping pangaea_downloads_stats table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_downloads_stats";
logger.info("Dropped pangaea_downloads_stats table ");
stmt.executeUpdate(sql);
// Creating Pangaea pangaea_views_stats_tmp table
logger.info("Creating Pangaea pangaea_views_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_views_stats_tmp AS "
+ "SELECT 'PANGAEA' as source, 're3data_____::9633d1e8c4309c833c2c442abeb0cfeb' as repository_id,"
+ "r.id as result_id,date, cast(count as BIGINT) as count, 0 as openaire "
+ "FROM default.pangaeaviews p, " + ConnectDB.getStatsDBSchema()
+ ".result_oids r where r.oid=p.result_id";
stmt.executeUpdate(sql);
logger.info("Created pangaea_views_stats_tmp table ");
// Creating Pangaea pangaea_downloads_stats_tmp table
logger.info("Creating Pedocs pangaea_downloads_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_downloads_stats_tmp AS "
+ "SELECT 'PANGAEA' as source, 're3data_____::9633d1e8c4309c833c2c442abeb0cfeb' as repository_id,"
+ "r.id as result_id, date, cast(count as BIGINT) as count, 0 as openaire "
+ "FROM default.pangaeadownloads p, " + ConnectDB.getStatsDBSchema()
+ ".result_oids r where r.oid=p.result_id";
stmt.executeUpdate(sql);
logger.info("Created pangaea_downloads_stats_tmp table ");
}
public void uploadTUDELFTStats() throws Exception {
stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
@@ -499,7 +704,8 @@ public class PiwikStatsDB {
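// The journal-listing query now also excludes the generic https://episciences.org/
// portal entry, so only the per-journal sites are processed.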
String returnEpisciencesJournals = "SELECT id, substring(regexp_extract(websiteurl,'^([^\\.]+)\\.?',1),9) FROM "
+ ConnectDB.getStatsDBSchema() +
".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true)";
".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true) "
+ "and websiteurl!='https://episciences.org/'";
PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
.prepareStatement(returnEpisciencesJournals);
@@ -525,12 +731,11 @@ public class PiwikStatsDB {
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) "
+ "AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema()
+ ".episcienceslog where action='action' and (source_item_type='oaItem' or "
+ "FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".episcienceslogdistinct where action='action' and (source_item_type='oaItem' or "
+ "source_item_type='repItem') and entity_id like '%" + episciencesSuffix + "%'"
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
+ "source ORDER BY source, entity_id";
stmt.executeUpdate(create_result_views_monthly);
logger.info("Created episciencesSuffix_result_views_monthly_tmp table");
@@ -547,6 +752,23 @@ public class PiwikStatsDB {
stmt.executeUpdate(insertIntoEpisciencesViewsTable);
logger.info("Inserted episciencesSuffix_result_views_monthly_tmp into EpisciencesViews Table");
logger
.info(
"Inserting episciencesSuffix_result_views_monthly_tmp for Episciences into EpisciencesViews Table");
String insertIntoEpisciencesViewsAllTable = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".episciencesviews SELECT 'Episciences' as source, "
+ " 'openaire____::6824b298c96ba906a3e6a70593affbf5' as repository_id, ro.id as result_id, month as date,"
+ " max(views) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema()
+ "." + episciencesSuffix.replace("-", "_") + "_result_views_monthly_tmp p,"
+ ConnectDB.getStatsDBSchema()
+ ".result_oids ro WHERE p.id=ro.oid GROUP BY ro.id, month ORDER BY ro.id, month";
stmt.executeUpdate(insertIntoEpisciencesViewsAllTable);
logger
.info(
"Inserted episciencesSuffix_result_views_monthly_tmp for Episciences into EpisciencesViews Table");
stmt.executeUpdate(dropepisciencesSuffixView);
logger.info("Dropped episciencesSuffix_result_views_monthly_tmp view");
}
@@ -576,7 +798,8 @@ public class PiwikStatsDB {
String returnEpisciencesJournals = "SELECT id, substring(regexp_extract(websiteurl,'^([^\\.]+)\\.?',1),9) FROM "
+ ConnectDB.getStatsDBSchema() +
".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true)";
".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true) "
+ "and websiteurl!='https://episciences.org/'";
PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
.prepareStatement(returnEpisciencesJournals);
@@ -600,8 +823,8 @@ public class PiwikStatsDB {
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) "
+ "AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema()
+ ".episcienceslog where action='download' and (source_item_type='oaItem' or "
+ "FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".episcienceslogdistinct where action='download' and (source_item_type='oaItem' or "
+ "source_item_type='repItem') and entity_id like '%" + episciencesSuffix + "%'"
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
+ "source ORDER BY source, entity_id";
@@ -622,6 +845,23 @@ public class PiwikStatsDB {
stmt.executeUpdate(insertIntoEpisciencesDownloadsTable);
logger.info("Inserted episciencesSuffix_result_downloads_monthly_tmp into EpisciencesDownloadsTable");
logger
.info(
"Inserting episciencesSuffix_result_downloads_monthly_tmp for Episciences into EpisciencesDownloadsTable");
String insertIntoEpisciencesDownloadsAllTable = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".episciencesdownloads SELECT 'Episciences' as source, "
+ " 'openaire____::6824b298c96ba906a3e6a70593affbf5' as repository_id, ro.id as result_id, month as date,"
+ " max(views) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema()
+ "." + episciencesSuffix.replace("-", "_") + "_result_downloads_monthly_tmp p,"
+ ConnectDB.getStatsDBSchema()
+ ".result_oids ro WHERE p.id=ro.oid GROUP BY ro.id, month ORDER BY ro.id, month";
stmt.executeUpdate(insertIntoEpisciencesDownloadsAllTable);
logger
.info(
"Inserted episciencesSuffix_result_downloads_monthly_tmp for Episciences into EpisciencesDownloadsTable");
stmt.executeUpdate(dropepisciencesSuffixDownloads);
logger.info("Dropped episciencesSuffix_result_downloads_monthly_tmp view");
@@ -767,9 +1007,15 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("Dropped Table tbl_all_r5_metrics");
logger.info("Create Table tbl_all_r5_metrics");
// All CoP R5 metrics Table
logger.info("Drop Table counter_r5_stats_with_metrics");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics ";
stmt.executeUpdate(sql);
logger.info("Dropped Table counter_r5_stats_with_metrics");
logger.info("Create Table counter_r5_stats_with_metrics");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".tbl_all_r5_metrics as "
+ ".counter_r5_stats_with_metrics as "
+ "WITH tmp1 as (SELECT coalesce(ds.repository_id, vs.repository_id) as repository_id, "
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, "
+ "coalesce(vs.unique_item_investigations, 0) as unique_item_investigations, "
@@ -793,7 +1039,7 @@ public class PiwikStatsDB {
+ "FROM tmp2 AS ds FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".tbl_total_item_requests "
+ "AS vs ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
stmt.executeUpdate(sql);
logger.info("Created Table tbl_all_r5_metrics");
logger.info("Created Table counter_r5_stats_with_metrics");
stmt.close();
ConnectDB.getHiveConnection().close();
@@ -857,6 +1103,13 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("Pedocs views updated to views_stats");
// Inserting Pangaea views stats
logger.info("Inserting Pangaea old data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Pangaea views updated to views_stats");
// Inserting TUDELFT views stats
logger.info("Inserting TUDELFT data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
@@ -878,6 +1131,12 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("B2SHARE views updated to views_stats");
// Inserting Datacite views stats
logger.info("Inserting Datacite views to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_views";
stmt.executeUpdate(sql);
logger.info("Creating downloads_stats table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
@@ -893,7 +1152,7 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("Inserted OpenAIRE data to downloads_stats");
// Inserting Episciences views stats
// Inserting Episciences downloads stats
logger.info("Inserting Episciences data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".episciencesdownloads";
@@ -907,6 +1166,13 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("Inserted Pedocs data to downloads_stats");
// Inserting Pangaea downloads stats
logger.info("Inserting Pangaea old data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pangaea_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Pangaea downloads updated to downloads_stats");
// Inserting TUDELFT downloads stats
logger.info("Inserting TUDELFT data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
@@ -920,6 +1186,7 @@ public class PiwikStatsDB {
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".b2share_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserted B2SHARE data to downloads_stats");
// Inserting Lareferencia downloads stats
logger.info("Inserting LaReferencia data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
@@ -934,14 +1201,13 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("IRUS downloads updated to downloads_stats");
// Inserting IRUS_R5 downloads stats
// Inserting IRUS_R5 views stats
logger.info("Inserting IRUS_R5 views to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT source, repository_id, result_id, `date`, views, openaire FROM "
+ ConnectDB.getUsageStatsDBSchema()
+ ".irus_R5_stats_tmp";
stmt.executeUpdate(sql);
logger.info("IRUS_R5 views updated to views_stats");
// Inserting IRUS_R5 downloads stats
logger.info("Inserting IRUS_R5 data to downloads_stats");
@@ -950,7 +1216,6 @@ public class PiwikStatsDB {
+ ConnectDB.getUsageStatsDBSchema()
+ ".irus_R5_stats_tmp";
stmt.executeUpdate(sql);
logger.info("IRUS_R5 downloads updated to downloads_stats");
// Inserting SARC-OJS downloads stats
logger.info("Inserting SARC data to downloads_stats");
@@ -959,19 +1224,11 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("SARC-OJS downloads updated to downloads_stats");
// Inserting Datacite views stats
logger.info("Inserting Datacite views to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_views";
stmt.executeUpdate(sql);
logger.info("Datacite views updated to views_stats");
// Inserting Datacite downloads stats
logger.info("Inserting Datacite downloads to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_downloads";
stmt.executeUpdate(sql);
logger.info("Datacite downloads updated to downloads_stats");
logger.info("Creating pageviews_stats table");
String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
@@ -986,28 +1243,6 @@ public class PiwikStatsDB {
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Dropping full_dates table");
String dropFullDates = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".full_dates";
stmt.executeUpdate(dropFullDates);
logger.info("Dropped full_dates table");
Calendar startCalendar = Calendar.getInstance();
startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
Calendar endCalendar = Calendar.getInstance();
int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
logger.info("Creating full_dates table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS "
+ "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date "
+ "FROM (SELECT DATE '2016-01-01' AS from_date) p "
+ "LATERAL VIEW "
+ "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x";
stmt.executeUpdate(sql);
logger.info("Created full_dates table");
logger.info("Inserting data to usage_stats");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS "
+ "SELECT coalesce(ds.source, vs.source) as source, "
@@ -1022,15 +1257,77 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("Inserted data to usage_stats");
// Dropping project_stats table
logger.info("Dropping project_stats table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".project_stats";
logger.info("Dropped project_stats table ");
stmt.executeUpdate(sql);
// Dropping datasource_stats table
logger.info("Dropping datasource_stats table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".datasource_stats";
logger.info("Dropped datasource_stats table ");
stmt.executeUpdate(sql);
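// New aggregates: project_downloads/project_views roll usage up to project level via
// project_results, then project_stats and datasource_stats combine views and
// downloads with FULL OUTER JOINs.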
logger.info("Inserting data to project_downloads");
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".project_downloads as "
+ " select pr.id, sum(count) downloads, sum(openaire) openaire_downloads,`date` "
+ " from " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ " join " + ConnectDB.getStatsDBSchema() + ".project_results pr on result_id=pr.result "
+ " join " + ConnectDB.getStatsDBSchema() + ".project p on p.id=pr.id "
+ " group by pr.id,`date`";
stmt.executeUpdate(sql);
logger.info("Inserted data to projects_downloads");
logger.info("Inserting data to project_views");
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".project_views as "
+ " select pr.id, sum(count) views, sum(openaire) openaire_views,`date` "
+ " from " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ " join " + ConnectDB.getStatsDBSchema() + ".project_results pr on result_id=pr.result "
+ " join " + ConnectDB.getStatsDBSchema() + ".project p on p.id=pr.id "
+ " group by pr.id,`date`";
stmt.executeUpdate(sql);
logger.info("Inserted data to project_views");
logger.info("Inserting data to project_stats");
sql = " CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".project_stats as "
+ " SELECT coalesce(pv.id, pd.id) as id, coalesce(pd.`date`, pv.`date`) as `date`, "
+ " coalesce(pv.views, 0) as views, coalesce(pd.downloads, 0) as downloads, "
+ " coalesce(pv.openaire_views,0) as openaire_views,coalesce(pd.openaire_downloads, 0) as openaire_downloads "
+ " FROM " + ConnectDB.getUsageStatsDBSchema() + ".project_downloads pd "
+ " FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".project_views pv "
+ " ON pd.id=pv.id WHERE pd.`date`=pv.`date`";
stmt.executeUpdate(sql);
logger.info("Inserted data to project_stats");
logger.info("Inserting data to datasource_stats");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".datasource_stats AS "
+ " with datasource_views as "
+ " (select repository_id, sum(views) views, sum(openaire_views) openaire_views,`date` "
+ " from " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats group by repository_id,`date`), "
+ " datasource_downloads as "
+ " (select repository_id, sum(downloads) downloads,sum(openaire_downloads) openaire_downloads,`date` "
+ " from " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats group by repository_id,`date`)"
+ " SELECT coalesce(dv.repository_id, dd.repository_id) as repository_id, coalesce(dd.`date`, dv.`date`) as `date`,"
+ " coalesce(dv.views, 0) as views, coalesce(dd.downloads, 0) as downloads, "
+ " coalesce(dv.openaire_views) as openaire_views,coalesce(dd.openaire_downloads, 0) as openaire_downloads "
+ " FROM datasource_downloads dd "
+ " FULL OUTER JOIN "
+ " datasource_views dv ON dd.repository_id=dv.repository_id WHERE dd.`date`=dv.`date`";
stmt.executeUpdate(sql);
logger.info("Inserted data to datasource_stats");
// Inserting LaReferencia CoP R5 Metrics
logger.info("Inserting Lareferencia data to tbl_all_r5_metrics");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".tbl_all_r5_metrics "
logger.info("Inserting Lareferencia data to counter_r5_stats_with_metrics");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_all_r5_metrics";
stmt.executeUpdate(sql);
// Inserting IRUS-UK CoP R5 Metrics
logger.info("Inserting IRUS-UK data into tbl_all_r5_metrics");
String insertR5Stats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".tbl_all_r5_metrics "
logger.info("Inserting IRUS-UK data into counter_r5_stats_with_metrics");
String insertR5Stats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics "
+ "SELECT s.source, d.id AS repository_id, "
+ "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, "
+ "s.unique_item_investigations , s.total_item_investigations, "
@@ -1040,7 +1337,7 @@ public class PiwikStatsDB {
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE s.repository=d.oid AND s.rid=ro.oid AND s.source='IRUS-UK'";
stmt.executeUpdate(insertR5Stats);
logger.info("Inserted IRUS-UK data into tbl_all_r5_metrics");
logger.info("Inserted IRUS-UK data into counter_r5_stats_with_metrics");
logger.info("Building views at permanent DB starts at: " + new Timestamp(System.currentTimeMillis()));
@@ -1088,6 +1385,28 @@ public class PiwikStatsDB {
stmt.executeUpdate(sql);
logger.info("Created view on usage_stats on permanent usagestats DB");
logger.info("Dropping view projects_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".projects_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view on projects_stats on permanent usagestats DB");
logger.info("Create view on project_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".project_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".project_stats";
stmt.executeUpdate(sql);
logger.info("Created view on project_stats on permanent usagestats DB");
logger.info("Dropping view datasource_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".datasource_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view on projects_stats on permanent usagestats DB");
logger.info("Create view on datasource_stats on permanent usagestats DB");
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsagestatsPermanentDBSchema() + ".datasource_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".datasource_stats";
stmt.executeUpdate(sql);
logger.info("Created view on project_stats on permanent usagestats DB");
logger.info("Dropping view COUNTER_R5_Metrics on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".counter_r5_stats_with_metrics";
stmt.executeUpdate(sql);
@@ -1096,7 +1415,7 @@ public class PiwikStatsDB {
logger.info("Create view on COUNTER_R5_Metrics on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema()
+ ".counter_r5_stats_with_metrics"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tbl_all_r5_metrics";
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics";
stmt.executeUpdate(sql);
logger.info("Created view on COUNTER_R5_Metrics on permanent usagestats DB");

View File

@@ -29,21 +29,16 @@ public class UsageStatsExporter {
logger.info("Initialising DB properties");
ConnectDB.init();
// runImpalaQuery();
PiwikStatsDB piwikstatsdb = new PiwikStatsDB();
logger.info("Re-creating database and tables");
if (ExecuteWorkflow.recreateDbAndTables) {
piwikstatsdb.recreateDBAndTables();
logger.info("DB-Tables are created ");
}
// else {
// piwikstatsdb.createTmpTables();
// logger.info("TmpTables are created ");
// }
//to remove logger.info("Re-creating database and tables");
// if (ExecuteWorkflow.recreateDbAndTables) {
// piwikstatsdb.recreateDBAndTables();
// logger.info("DB-Tables are created ");
//to remove }
if (ExecuteWorkflow.processPiwikLogs) {
logger.info("Creating distinct piwik log");
piwikstatsdb.createDistinctPiwikLog();
logger.info("Processing OpenAIRE logs");
//to remove logger.info("Creating distinct piwik log");
// piwikstatsdb.createDistinctPiwikLog();
//to remove logger.info("Processing OpenAIRE logs");
piwikstatsdb.processLogs();
logger.info("OpenAIRE logs Done");
logger.info("Processing Episciences logs");
@@ -52,6 +47,9 @@ public class UsageStatsExporter {
logger.info("Processing Pedocs Old Stats");
piwikstatsdb.uploadOldPedocs();
logger.info("Processing Pedocs Old Stats Done");
logger.info("Processing Pangaea Stats");
piwikstatsdb.uploadPangaeaLogs();
logger.info("Processing Pangaea Stats Done");
logger.info("Processing TUDELFT Stats");
piwikstatsdb.uploadTUDELFTStats();
logger.info("Processing TUDELFT Stats Done");
@@ -116,6 +114,18 @@ public class UsageStatsExporter {
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
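// Also refresh Impala metadata for the newly materialised tables
// (project_stats, datasource_stats, counter_r5_stats_with_metrics).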
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".project_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".datasource_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".counter_r5_stats_with_metrics";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
@@ -125,6 +135,12 @@ public class UsageStatsExporter {
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".project_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".datasource_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);

View File

@@ -9,8 +9,34 @@ fi
export SOURCE=$1
export PRODUCTION=$2
#echo "Updating ${PRODUCTION} database"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
#echo "Production db ready!"
echo "Updating ${PRODUCTION} database"
impala-shell -q "create database if not exists ${PRODUCTION}"
impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
echo "Production db ready!"
impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.views_stats;"
impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.views_stats as SELECT * from ${SOURCE}.views_stats;"
impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.views_stats;"
impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.pageviews_stats;"
impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.pageviews_stats as SELECT * from ${SOURCE}.pageviews_stats;"
impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.pageviews_stats as SELECT * from ${SOURCE}.pageviews_stats;"
impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.pageviews_stats;"
impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.downloads_stats;"
impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.downloads_stats as SELECT * from ${SOURCE}.downloads_stats;"
impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.downloads_stats;"
impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.usage_stats;"
impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.usage_stats as SELECT * from ${SOURCE}.usage_stats;"
impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.usage_stats;"
impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.project_stats;"
impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.project_stats as SELECT * from ${SOURCE}.project_stats;"
impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.project_stats;"
impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.datasource_stats;"
impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.datasource_stats as SELECT * from ${SOURCE}.datasource_stats;"
impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.datasource_stats;"
impala-shell -d ${PRODUCTION} -q "DROP VIEW IF EXISTS ${PRODUCTION}.counter_r5_stats_with_metrics;"
impala-shell -d ${PRODUCTION} -q "CREATE VIEW ${PRODUCTION}.counter_r5_stats_with_metrics as SELECT * from ${SOURCE}.counter_r5_stats_with_metrics;"
impala-shell -d ${PRODUCTION} -q "INVALIDATE METADATA ${PRODUCTION}.counter_r5_stats_with_metrics;"
echo "Production db ready!"

View File

@@ -1,11 +1,11 @@
<workflow-app name="Usage Stats" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>usage_stats_db_name</name>
<name>usage_stats_db</name>
<description>the target usage stats database name</description>
</property>
<property>
<name>usage_stats_db_production_name</name>
<name>usage_stats_db_production</name>
<description>the name of the public production usage stats database</description>
</property>
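<!-- the *_name suffix is dropped from both database parameters; the shell action arguments below are updated to match -->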
<property>
@@ -48,8 +48,8 @@
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>updateProductionViews.sh</exec>
<argument>${usage_stats_db_name}</argument>
<argument>${usage_stats_db_production_name}</argument>
<argument>${usage_stats_db}</argument>
<argument>${usage_stats_db_production}</argument>
<file>updateProductionViews.sh</file>
</shell>
<ok to="End"/>

View File

@@ -1,91 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-usage-stats-build</artifactId>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.15</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<!-- more config here as you see fit -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version>
<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>com.googlecode.json-simple</groupId>
<artifactId>json-simple</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20180130</version>
<type>jar</type>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${cdh.hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${cdh.hadoop.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.1.2</version>
<type>jar</type>
</dependency>
</dependencies>
<name>dhp-usage-stats-build</name>
</project>