package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.io.*; import java.net.URLDecoder; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; import java.sql.Statement; import java.sql.Timestamp; import java.text.SimpleDateFormat; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author D. Pierrakos, S. Zoupanos */ public class LaReferenciaStats { private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class); private String logRepoPath; private Statement stmt = null; private String CounterRobotsURL; private ArrayList robotsList; public LaReferenciaStats() throws Exception { } public void createDistinctLaReferenciaLog() throws Exception { logger.info("Initialising DB properties"); ConnectDB.init(); Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Dropping lareferencialogdistinct"); String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct"; stmt.executeUpdate(sql); logger.info("Dropped lareferencialogdistinct"); logger.info("Creating lareferencialogdistinct table"); // Create Piwiklogdistinct table - This table should exist String sqlCreateTablePiwikLogDistinct = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct(matomoid INT, source STRING, id_visit STRING, country STRING, action STRING, url STRING, " + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + "clustered by (source, id_visit, action, timestamp, entity_id) " + "into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTablePiwikLogDistinct); logger.info("Created lareferencialogdistinct table"); logger.info("Inserting data to lareferencialogdistinct"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "SELECT DISTINCT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog WHERE entity_id is not null"; stmt.executeUpdate(sql); logger.info("Inserted data to lareferencialogdistinct"); } public void processLogs() throws Exception { try { logger.info("Creating LareferenciaLogDistinct"); createDistinctLaReferenciaLog(); logger.info("LaReferencia creating viewsStats"); viewsStats(); logger.info("LaReferencia created viewsStats"); logger.info("LaReferencia creating downloadsStats"); downloadsStats(); logger.info("LaReferencia created downloadsStats"); logger.info("LaReferencia creating COUNTER CoP R5 metrics"); createCoPR5TablesForLareferencia(); logger.info("LaReferencia created COUNTER CoP R5 metrics"); // logger.info("LaReferencia updating Production Tables"); // updateProdTables(); // logger.info("LaReferencia updated Production Tables"); } catch (Exception e) { logger.error("Failed to process logs: " + e); throw new Exception("Failed to process logs: " + e.toString(), e); } } public void viewsStats() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Creating la_result_views_monthly_tmp view"); String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp AS " + "SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + "THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct where action='action' and " + "(source_item_type='oaItem' or source_item_type='repItem') " + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + "source ORDER BY source, entity_id"; stmt.executeUpdate(sql); logger.info("Created la_result_views_monthly_tmp view"); logger.info("Dropping la_views_stats_tmp table"); sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; stmt.executeUpdate(sql); logger.info("Dropped la_views_stats_tmp table"); logger.info("Creating la_views_stats_tmp table"); sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp " + "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " + "max(views) AS count, max(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp p, " + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + "WHERE p.source=d.oid AND p.id=ro.oid " + "GROUP BY d.id, ro.id, month " + "ORDER BY d.id, ro.id, month"; stmt.executeUpdate(sql); logger.info("Created la_views_stats_tmp table"); stmt.close(); // ConnectDB.getHiveConnection().close(); } private void downloadsStats() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Creating la_result_downloads_monthly_tmp view"); String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".la_result_downloads_monthly_tmp AS " + "SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + "THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct where action='download' and " + "(source_item_type='oaItem' or source_item_type='repItem') " + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + "source ORDER BY source, entity_id"; stmt.executeUpdate(sql); logger.info("Created la_result_downloads_monthly_tmp view"); logger.info("Dropping la_downloads_stats_tmp table"); sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; stmt.executeUpdate(sql); logger.info("Dropped la_downloads_stats_tmp table"); logger.info("Creating la_downloads_stats_tmp table"); sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp " + "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " + "max(downloads) AS count, max(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_downloads_monthly_tmp p, " + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + "WHERE p.source=d.oid AND p.id=ro.oid " + "GROUP BY d.id, ro.id, month " + "ORDER BY d.id, ro.id, month"; stmt.executeUpdate(sql); logger.info("Created la_downloads_stats_tmp table"); stmt.close(); // ConnectDB.getHiveConnection().close(); } private void createCoPR5TablesForLareferencia() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); // Unique Item Investigations logger.info("Create View Unique_Item_Investigations"); String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_unique_item_investigations " + "AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + "CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_investigations, " + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "WHERE (source_item_type='oaItem' or source_item_type='repItem') " + "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source "; stmt.executeUpdate(sql); logger.info("Created View Unique_Item_Investigations"); logger.info("Drop Table Unique_Item_Investigations"); sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_investigations "; stmt.executeUpdate(sql); logger.info("Dropped Table Unique_Item_Investigations"); logger.info("Create Table tbl_unique_item_investigations"); sql = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_investigations as " + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + "sum(unique_item_investigations) AS unique_item_investigations, sum(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_unique_item_investigations p, " + ConnectDB.getStatsDBSchema() + ".datasource d," + ConnectDB.getStatsDBSchema() + ".result_oids ro " + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' " + "AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' " + "GROUP BY d.id, ro.id, month "; stmt.executeUpdate(sql); logger.info("Created Table tbl_unique_item_investigations"); // Total Item Investigations logger.info("Create View lr_view_total_item_investigations"); sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_total_item_investigations " + "AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + "COUNT(entity_id) AS total_item_investigations, " + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "WHERE (source_item_type='oaItem' or source_item_type='repItem') " + "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source "; stmt.executeUpdate(sql); logger.info("Created View lr_view_total_item_investigations"); logger.info("Drop Table lr_tbl_total_item_investigations"); sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_investigations "; stmt.executeUpdate(sql); logger.info("Dropped Table lr_tbl_total_item_investigations"); logger.info("Create Table lr_tbl_total_item_investigations"); sql = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_investigations as " + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + "sum(total_item_investigations) AS total_item_investigations, sum(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_total_item_investigations p, " + ConnectDB.getStatsDBSchema() + ".datasource d," + ConnectDB.getStatsDBSchema() + ".result_oids ro " + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' " + "AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' " + "GROUP BY d.id, ro.id, month "; stmt.executeUpdate(sql); logger.info("Created Table lr_tbl_total_item_investigations"); // Unique Item Requests logger.info("Create View lr_view_unique_item_requests"); sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_unique_item_requests AS " + "SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + "CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_requests, " + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem') " + "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source "; stmt.executeUpdate(sql); logger.info("Created View lr_view_unique_item_requests"); logger.info("Drop Table Unique_Item_Requests"); sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_requests "; stmt.executeUpdate(sql); logger.info("Dropped Table Unique_Item_Requests"); logger.info("Create Table lr_tbl_unique_item_requests"); sql = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_requests as " + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + "sum(unique_item_requests) AS unique_item_requests, sum(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_unique_item_requests p, " + ConnectDB.getStatsDBSchema() + ".datasource d," + ConnectDB.getStatsDBSchema() + ".result_oids ro " + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' " + "AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' " + "GROUP BY d.id, ro.id, month "; stmt.executeUpdate(sql); logger.info("Created Table lr_tbl_unique_item_requests"); // Total Item Requests logger.info("Create View lr_view_total_item_requests"); sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_total_item_requests " + "AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + "COUNT(entity_id) AS total_item_requests, " + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogdistinct " + "WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem') " + "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, " + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source "; stmt.executeUpdate(sql); logger.info("Created View lr_view_total_item_requests"); logger.info("Drop Table lr_tbl_total_item_requests"); sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_requests "; stmt.executeUpdate(sql); logger.info("Dropped Table lr_tbl_total_item_requests"); logger.info("Create Table lr_tbl_total_item_requests"); sql = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_requests as " + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + "sum(total_item_requests) AS total_item_requests, sum(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".view_total_item_requests p, " + ConnectDB.getStatsDBSchema() + ".datasource d," + ConnectDB.getStatsDBSchema() + ".result_oids ro " + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' " + "AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' " + "GROUP BY d.id, ro.id, month "; stmt.executeUpdate(sql); logger.info("Created Table lr_tbl_total_item_requests"); // All CoP R5 metrics Table logger.info("Drop Table lr_tbl_all_r5_metrics"); sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_all_r5_metrics "; stmt.executeUpdate(sql); logger.info("Dropped Table lr_tbl_all_r5_metrics"); logger.info("Create Table lr_tbl_all_r5_metrics"); sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_all_r5_metrics as " + "WITH tmp1 as (SELECT coalesce(ds.repository_id, vs.repository_id) as repository_id, " + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + "coalesce(vs.unique_item_investigations, 0) as unique_item_investigations, " + "coalesce(ds.total_item_investigations, 0) as total_item_investigations " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_investigations AS vs " + "FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_investigations AS ds " + " ON ds.source=vs.source AND ds.result_id=vs.result_id AND ds.date=vs.date), " + "tmp2 AS (select coalesce (ds.repository_id, vs.repository_id) as repository_id, " + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + "coalesce(ds.total_item_investigations, 0) as total_item_investigations, " + "coalesce(ds.unique_item_investigations, 0) as unique_item_investigations, " + " coalesce(vs.unique_item_requests, 0) as unique_item_requests FROM tmp1 " + "AS ds FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_requests AS vs " + "ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date) " + "SELECT 'LaReferencia' as source, coalesce (ds.repository_id, vs.repository_id) as repository_id, " + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + "coalesce(ds.unique_item_investigations, 0) as unique_item_investigations, " + "coalesce(ds.total_item_investigations, 0) as total_item_investigations, " + "coalesce(ds.unique_item_requests, 0) as unique_item_requests, " + "coalesce(vs.total_item_requests, 0) as total_item_requests " + "FROM tmp2 AS ds FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_requests " + "AS vs ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; stmt.executeUpdate(sql); logger.info("Created Table tbl_all_r5_metrics"); stmt.close(); ConnectDB.getHiveConnection().close(); } }