forked from D-Net/dnet-hadoop
Changes in the usage stats update workflow
This commit is contained in:
parent
566e2459a8
commit
6b247524a8
|
@ -1,140 +0,0 @@
|
|||
/*
|
||||
* To change this license header, choose License Headers in Project Properties.
|
||||
* To change this template file, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package eu.dnetlib.oa.graph.usagestatsbuild.export;
|
||||
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.log4j.BasicConfigurator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
/**
|
||||
* @author D. Pierrakos, S. Zoupanos
|
||||
*/
|
||||
public class ExecuteWorkflow {
|
||||
|
||||
// static String matomoAuthToken;
|
||||
static String portalMatomoID;
|
||||
// static String irusUKBaseURL;
|
||||
// static String lareferenciaBaseURL;
|
||||
// static String lareferenciaAuthToken;
|
||||
static String dbHiveUrl;
|
||||
static String dbImpalaUrl;
|
||||
static String usageRawDataDBSchema;
|
||||
static String usageStatsDBSchema;
|
||||
static String usagestatsPermanentDBSchema;
|
||||
static String statsDBSchema;
|
||||
static boolean recreateDbAndTables;
|
||||
|
||||
static boolean processPiwikLogs;
|
||||
static boolean processLaReferenciaLogs;
|
||||
|
||||
static boolean irusProcessStats;
|
||||
|
||||
static boolean sarcProcessStats;
|
||||
|
||||
static boolean finalizeStats;
|
||||
static boolean finalTablesVisibleToImpala;
|
||||
|
||||
static int numberOfDownloadThreads;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
|
||||
// Sending the logs to the console
|
||||
BasicConfigurator.configure();
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
UsageStatsExporter.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
// Setting up the initial parameters
|
||||
// matomoAuthToken = parser.get("matomoAuthToken");
|
||||
// matomoBaseURL = parser.get("matomoBaseURL");
|
||||
portalMatomoID = parser.get("portalMatomoID");
|
||||
// irusUKBaseURL = parser.get("irusUKBaseURL");
|
||||
// lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
|
||||
// lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
|
||||
|
||||
dbHiveUrl = parser.get("dbHiveUrl");
|
||||
dbImpalaUrl = parser.get("dbImpalaUrl");
|
||||
usageRawDataDBSchema = parser.get("usageRawDataDBSchema");
|
||||
usageStatsDBSchema = parser.get("usageStatsDBSchema");
|
||||
usagestatsPermanentDBSchema = parser.get("usagestatsPermanentDBSchema");
|
||||
statsDBSchema = parser.get("statsDBSchema");
|
||||
|
||||
if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
|
||||
processPiwikLogs = true;
|
||||
} else {
|
||||
processPiwikLogs = false;
|
||||
}
|
||||
|
||||
// String startingLogPeriodStr = parser.get("startingLogPeriod");
|
||||
// Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
|
||||
// startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
|
||||
//
|
||||
// String endingLogPeriodStr = parser.get("endingLogPeriod");
|
||||
// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
|
||||
// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
|
||||
|
||||
if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
|
||||
recreateDbAndTables = true;
|
||||
} else {
|
||||
recreateDbAndTables = false;
|
||||
}
|
||||
|
||||
if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) {
|
||||
processLaReferenciaLogs = true;
|
||||
} else {
|
||||
processLaReferenciaLogs = false;
|
||||
}
|
||||
|
||||
if (parser.get("irusProcessStats").toLowerCase().equals("true")) {
|
||||
irusProcessStats = true;
|
||||
} else {
|
||||
irusProcessStats = false;
|
||||
}
|
||||
|
||||
if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
|
||||
sarcProcessStats = true;
|
||||
} else {
|
||||
sarcProcessStats = false;
|
||||
}
|
||||
|
||||
if (parser.get("finalizeStats").toLowerCase().equals("true")) {
|
||||
finalizeStats = true;
|
||||
} else {
|
||||
finalizeStats = false;
|
||||
}
|
||||
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) {
|
||||
finalTablesVisibleToImpala = true;
|
||||
} else {
|
||||
numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
|
||||
}
|
||||
|
||||
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
|
||||
usagestatsExport.export();
|
||||
}
|
||||
|
||||
private static Calendar startingLogPeriodStr(Date date) {
|
||||
|
||||
Calendar calendar = Calendar.getInstance();
|
||||
calendar.setTime(date);
|
||||
return calendar;
|
||||
|
||||
}
|
||||
}
|
|
@ -1,95 +0,0 @@
|
|||
|
||||
package eu.dnetlib.oa.graph.usagestatsbuild.export;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.Statement;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.json.simple.JSONArray;
|
||||
import org.json.simple.JSONObject;
|
||||
import org.json.simple.parser.JSONParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* @author D. Pierrakos, S. Zoupanos
|
||||
*/
|
||||
public class IrusStats {
|
||||
|
||||
private String irusUKURL;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
|
||||
|
||||
public IrusStats() throws Exception {
|
||||
}
|
||||
|
||||
public void processIrusStats() throws Exception {
|
||||
Statement stmt = ConnectDB.getHiveConnection().createStatement();
|
||||
ConnectDB.getHiveConnection().setAutoCommit(false);
|
||||
|
||||
logger.info("Creating irus_downloads_stats_tmp table");
|
||||
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".irus_downloads_stats_tmp "
|
||||
+ "(`source` string, "
|
||||
+ "`repository_id` string, "
|
||||
+ "`result_id` string, "
|
||||
+ "`date` string, "
|
||||
+ "`count` bigint, "
|
||||
+ "`openaire` bigint)";
|
||||
stmt.executeUpdate(createDownloadsStats);
|
||||
logger.info("Created irus_downloads_stats_tmp table");
|
||||
|
||||
logger.info("Inserting into irus_downloads_stats_tmp");
|
||||
String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp "
|
||||
+ "SELECT s.source, d.id AS repository_id, "
|
||||
+ "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
|
||||
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
|
||||
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
|
||||
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
|
||||
+ "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
|
||||
stmt.executeUpdate(insertDStats);
|
||||
logger.info("Inserted into irus_downloads_stats_tmp");
|
||||
|
||||
String createR5Stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".irus_R5_stats_tmp "
|
||||
+ "(`source` string, "
|
||||
+ "`repository_id` string, "
|
||||
+ "`result_id` string, "
|
||||
+ "`date` string, "
|
||||
+ "`views` bigint, "
|
||||
+ "`downloads` bigint, "
|
||||
+ "`openaire` bigint)";
|
||||
stmt.executeUpdate(createR5Stats);
|
||||
logger.info("Created irus_R5_stats_tmp table");
|
||||
|
||||
logger.info("Inserting into irus_R5_stats_tmp");
|
||||
String insertΡ5Stats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_R5_stats_tmp "
|
||||
+ "SELECT s.source, d.id AS repository_id, "
|
||||
+ "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, "
|
||||
+ "(s.total_item_investigations-s.total_item_requests) as views, s.total_item_requests as downloads, '0' "
|
||||
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog_cop_r5 s, "
|
||||
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
|
||||
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
|
||||
+ "WHERE s.repository=d.oid AND s.rid=ro.oid AND s.source='IRUS-UK'";
|
||||
stmt.executeUpdate(insertΡ5Stats);
|
||||
logger.info("Inserted into irus_R5_stats_tmp");
|
||||
|
||||
stmt.close();
|
||||
// ConnectDB.getHiveConnection().close();
|
||||
}
|
||||
//// to add create table sushilog_cop_r5 as select * from openaire_prod_usage_raw.sushilog_cop_r5
|
||||
//// to add create table sushilog_cop_r5 as select * from openaire_prod_usage_raw.sushilog_cop_r5
|
||||
|
||||
}
|
|
@ -1,321 +0,0 @@
|
|||
|
||||
package eu.dnetlib.oa.graph.usagestatsbuild.export;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URLDecoder;
|
||||
import java.sql.Connection;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.sql.Timestamp;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
import org.json.simple.JSONArray;
|
||||
import org.json.simple.JSONObject;
|
||||
import org.json.simple.parser.JSONParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* @author D. Pierrakos, S. Zoupanos
|
||||
*/
|
||||
public class LaReferenciaStats {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class);
|
||||
|
||||
private String logRepoPath;
|
||||
|
||||
private Statement stmt = null;
|
||||
|
||||
private String CounterRobotsURL;
|
||||
private ArrayList robotsList;
|
||||
|
||||
public LaReferenciaStats() throws Exception {
|
||||
}
|
||||
|
||||
public void processLogs() throws Exception {
|
||||
try {
|
||||
logger.info("LaReferencia creating viewsStats");
|
||||
viewsStats();
|
||||
logger.info("LaReferencia created viewsStats");
|
||||
|
||||
logger.info("LaReferencia creating downloadsStats");
|
||||
downloadsStats();
|
||||
logger.info("LaReferencia created downloadsStats");
|
||||
|
||||
logger.info("LaReferencia creating COUNTER CoP R5 metrics");
|
||||
createCoPR5TablesForLareferencia();
|
||||
logger.info("LaReferencia created COUNTER CoP R5 metrics");
|
||||
|
||||
// logger.info("LaReferencia updating Production Tables");
|
||||
// updateProdTables();
|
||||
// logger.info("LaReferencia updated Production Tables");
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to process logs: " + e);
|
||||
throw new Exception("Failed to process logs: " + e.toString(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public void viewsStats() throws Exception {
|
||||
|
||||
Statement stmt = ConnectDB.getHiveConnection().createStatement();
|
||||
ConnectDB.getHiveConnection().setAutoCommit(false);
|
||||
|
||||
logger.info("Creating la_result_views_monthly_tmp view");
|
||||
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp AS "
|
||||
+
|
||||
"SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " +
|
||||
"THEN 1 ELSE 0 END) AS openaire_referrer, " +
|
||||
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
|
||||
"FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='action' and " +
|
||||
"(source_item_type='oaItem' or source_item_type='repItem') " +
|
||||
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
|
||||
"source ORDER BY source, entity_id";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created la_result_views_monthly_tmp view");
|
||||
|
||||
logger.info("Dropping la_views_stats_tmp table");
|
||||
sql = "DROP TABLE IF EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
".la_views_stats_tmp";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Dropped la_views_stats_tmp table");
|
||||
|
||||
logger.info("Creating la_views_stats_tmp table");
|
||||
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp " +
|
||||
"AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " +
|
||||
"max(views) AS count, max(openaire_referrer) AS openaire " +
|
||||
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp p, " +
|
||||
ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
|
||||
"WHERE p.source=d.oid AND p.id=ro.oid " +
|
||||
"GROUP BY d.id, ro.id, month " +
|
||||
"ORDER BY d.id, ro.id, month";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created la_views_stats_tmp table");
|
||||
|
||||
stmt.close();
|
||||
// ConnectDB.getHiveConnection().close();
|
||||
}
|
||||
|
||||
private void downloadsStats() throws Exception {
|
||||
|
||||
Statement stmt = ConnectDB.getHiveConnection().createStatement();
|
||||
ConnectDB.getHiveConnection().setAutoCommit(false);
|
||||
|
||||
logger.info("Creating la_result_downloads_monthly_tmp view");
|
||||
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".la_result_downloads_monthly_tmp AS " +
|
||||
"SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " +
|
||||
"THEN 1 ELSE 0 END) AS openaire_referrer, " +
|
||||
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
|
||||
"FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='download' and " +
|
||||
"(source_item_type='oaItem' or source_item_type='repItem') " +
|
||||
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
|
||||
"source ORDER BY source, entity_id";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created la_result_downloads_monthly_tmp view");
|
||||
|
||||
logger.info("Dropping la_downloads_stats_tmp table");
|
||||
sql = "DROP TABLE IF EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
".la_downloads_stats_tmp";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Dropped la_downloads_stats_tmp table");
|
||||
|
||||
logger.info("Creating la_downloads_stats_tmp table");
|
||||
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp " +
|
||||
"AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " +
|
||||
"max(downloads) AS count, max(openaire_referrer) AS openaire " +
|
||||
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_downloads_monthly_tmp p, " +
|
||||
ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
|
||||
"WHERE p.source=d.oid AND p.id=ro.oid " +
|
||||
"GROUP BY d.id, ro.id, month " +
|
||||
"ORDER BY d.id, ro.id, month";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created la_downloads_stats_tmp table");
|
||||
|
||||
stmt.close();
|
||||
// ConnectDB.getHiveConnection().close();
|
||||
}
|
||||
|
||||
private void createCoPR5TablesForLareferencia() throws Exception {
|
||||
Statement stmt = ConnectDB.getHiveConnection().createStatement();
|
||||
ConnectDB.getHiveConnection().setAutoCommit(false);
|
||||
|
||||
// Unique Item Investigations
|
||||
|
||||
logger.info("Create View Unique_Item_Investigations");
|
||||
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".lr_view_unique_item_investigations "
|
||||
+ "AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
|
||||
+ "CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_investigations, "
|
||||
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
|
||||
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog "
|
||||
+ "WHERE (source_item_type='oaItem' or source_item_type='repItem') "
|
||||
+ "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created View Unique_Item_Investigations");
|
||||
|
||||
logger.info("Drop Table Unique_Item_Investigations");
|
||||
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_investigations ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Dropped Table Unique_Item_Investigations");
|
||||
|
||||
logger.info("Create Table tbl_unique_item_investigations");
|
||||
sql = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_investigations as "
|
||||
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
|
||||
+ "sum(unique_item_investigations) AS unique_item_investigations, sum(openaire_referrer) AS openaire "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_unique_item_investigations p, "
|
||||
+ ConnectDB.getStatsDBSchema() + ".datasource d," + ConnectDB.getStatsDBSchema() + ".result_oids ro "
|
||||
+ "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' "
|
||||
+ "AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' "
|
||||
+ "GROUP BY d.id, ro.id, month ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created Table tbl_unique_item_investigations");
|
||||
|
||||
// Total Item Investigations
|
||||
|
||||
logger.info("Create View lr_view_total_item_investigations");
|
||||
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_total_item_investigations "
|
||||
+ "AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
|
||||
+ "COUNT(entity_id) AS total_item_investigations, "
|
||||
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
|
||||
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog "
|
||||
+ "WHERE (source_item_type='oaItem' or source_item_type='repItem') "
|
||||
+ "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created View lr_view_total_item_investigations");
|
||||
|
||||
logger.info("Drop Table lr_tbl_total_item_investigations");
|
||||
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_investigations ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Dropped Table lr_tbl_total_item_investigations");
|
||||
|
||||
logger.info("Create Table lr_tbl_total_item_investigations");
|
||||
sql = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_investigations as "
|
||||
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
|
||||
+ "sum(total_item_investigations) AS total_item_investigations, sum(openaire_referrer) AS openaire "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_total_item_investigations p, "
|
||||
+ ConnectDB.getStatsDBSchema() + ".datasource d," + ConnectDB.getStatsDBSchema() + ".result_oids ro "
|
||||
+ "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' "
|
||||
+ "AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' "
|
||||
+ "GROUP BY d.id, ro.id, month ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created Table lr_tbl_total_item_investigations");
|
||||
|
||||
// Unique Item Requests
|
||||
|
||||
logger.info("Create View lr_view_unique_item_requests");
|
||||
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_unique_item_requests AS "
|
||||
+ "SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
|
||||
+ "CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_requests, "
|
||||
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
|
||||
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog "
|
||||
+ "WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem') "
|
||||
+ "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created View lr_view_unique_item_requests");
|
||||
|
||||
logger.info("Drop Table Unique_Item_Requests");
|
||||
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_requests ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Dropped Table Unique_Item_Requests");
|
||||
|
||||
logger.info("Create Table lr_tbl_unique_item_requests");
|
||||
sql = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_requests as "
|
||||
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
|
||||
+ "sum(unique_item_requests) AS unique_item_requests, sum(openaire_referrer) AS openaire "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_unique_item_requests p, "
|
||||
+ ConnectDB.getStatsDBSchema() + ".datasource d," + ConnectDB.getStatsDBSchema() + ".result_oids ro "
|
||||
+ "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' "
|
||||
+ "AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' "
|
||||
+ "GROUP BY d.id, ro.id, month ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created Table lr_tbl_unique_item_requests");
|
||||
|
||||
// Total Item Requests
|
||||
|
||||
logger.info("Create View lr_view_total_item_requests");
|
||||
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".lr_view_total_item_requests "
|
||||
+ "AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
|
||||
+ "COUNT(entity_id) AS total_item_requests, "
|
||||
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
|
||||
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog "
|
||||
+ "WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem') "
|
||||
+ "AND entity_id is NOT NULL GROUP BY id_visit, entity_id, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created View lr_view_total_item_requests");
|
||||
|
||||
logger.info("Drop Table lr_tbl_total_item_requests");
|
||||
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_requests ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Dropped Table lr_tbl_total_item_requests");
|
||||
|
||||
logger.info("Create Table lr_tbl_total_item_requests");
|
||||
sql = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_requests as "
|
||||
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
|
||||
+ "sum(total_item_requests) AS total_item_requests, sum(openaire_referrer) AS openaire "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".view_total_item_requests p, "
|
||||
+ ConnectDB.getStatsDBSchema() + ".datasource d," + ConnectDB.getStatsDBSchema() + ".result_oids ro "
|
||||
+ "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404' "
|
||||
+ "AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' "
|
||||
+ "GROUP BY d.id, ro.id, month ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created Table lr_tbl_total_item_requests");
|
||||
|
||||
// All CoP R5 metrics Table
|
||||
logger.info("Drop Table lr_tbl_all_r5_metrics");
|
||||
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_all_r5_metrics ";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Dropped Table lr_tbl_all_r5_metrics");
|
||||
|
||||
logger.info("Create Table lr_tbl_all_r5_metrics");
|
||||
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_all_r5_metrics as "
|
||||
+ "WITH tmp1 as (SELECT coalesce(ds.repository_id, vs.repository_id) as repository_id, "
|
||||
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, "
|
||||
+ "coalesce(vs.unique_item_investigations, 0) as unique_item_investigations, "
|
||||
+ "coalesce(ds.total_item_investigations, 0) as total_item_investigations "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_investigations AS vs "
|
||||
+ "FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_investigations AS ds "
|
||||
+ " ON ds.source=vs.source AND ds.result_id=vs.result_id AND ds.date=vs.date), "
|
||||
+ "tmp2 AS (select coalesce (ds.repository_id, vs.repository_id) as repository_id, "
|
||||
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, "
|
||||
+ "coalesce(ds.total_item_investigations, 0) as total_item_investigations, "
|
||||
+ "coalesce(ds.unique_item_investigations, 0) as unique_item_investigations, "
|
||||
+ " coalesce(vs.unique_item_requests, 0) as unique_item_requests FROM tmp1 "
|
||||
+ "AS ds FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_unique_item_requests AS vs "
|
||||
+ "ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date) "
|
||||
+ "SELECT 'LaReferencia' as source, coalesce (ds.repository_id, vs.repository_id) as repository_id, "
|
||||
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, "
|
||||
+ "coalesce(ds.unique_item_investigations, 0) as unique_item_investigations, "
|
||||
+ "coalesce(ds.total_item_investigations, 0) as total_item_investigations, "
|
||||
+ "coalesce(ds.unique_item_requests, 0) as unique_item_requests, "
|
||||
+ "coalesce(vs.total_item_requests, 0) as total_item_requests "
|
||||
+ "FROM tmp2 AS ds FULL OUTER JOIN " + ConnectDB.getUsageStatsDBSchema() + ".lr_tbl_total_item_requests "
|
||||
+ "AS vs ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
|
||||
stmt.executeUpdate(sql);
|
||||
logger.info("Created Table tbl_all_r5_metrics");
|
||||
|
||||
stmt.close();
|
||||
ConnectDB.getHiveConnection().close();
|
||||
|
||||
}
|
||||
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,107 +0,0 @@
|
|||
|
||||
package eu.dnetlib.oa.graph.usagestatsbuild.export;
|
||||
|
||||
import java.io.*;
|
||||
// import java.io.BufferedReader;
|
||||
// import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.json.simple.JSONArray;
|
||||
import org.json.simple.JSONObject;
|
||||
import org.json.simple.parser.JSONParser;
|
||||
import org.json.simple.parser.ParseException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* @author D. Pierrakos, S. Zoupanos
|
||||
*/
|
||||
/**
 * Builds the SARC-OJS usage-statistics table (sarc_downloads_stats_tmp) from
 * the raw sushilog data, matching repositories by OID substring and results
 * by DOI.
 *
 * @author D. Pierrakos, S. Zoupanos
 */
public class SarcStats {

    private Statement stmtHive = null;
    private Statement stmtImpala = null;

    private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);

    public SarcStats() throws Exception {
        // createTables();
    }

    /**
     * Legacy table-creation step; its only call site (the constructor) is
     * commented out.
     * NOTE(review): this DDL uses PostgreSQL syntax (TEXT, PRIMARY KEY,
     * CREATE RULE, CREATE INDEX IF NOT EXISTS) but is executed on the Hive
     * connection — presumably residue from a pre-Hive version; verify before
     * re-enabling.
     *
     * @throws Exception if any DDL statement fails
     */
    private void createTables() throws Exception {
        try {

            stmtHive = ConnectDB.getHiveConnection().createStatement();
            String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
            stmtHive.executeUpdate(sqlCreateTableSushiLog);

            // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
            // stmt.executeUpdate(sqlCopyPublicSushiLog);
            // Rule to silently drop duplicate inserts (PostgreSQL-specific).
            String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
                + " ON INSERT TO sushilog "
                + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
                + "sushilog.rid, sushilog.date "
                + "FROM sushilog "
                + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
            stmtHive.executeUpdate(sqlcreateRuleSushiLog);
            String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
            stmtHive.executeUpdate(createSushiIndex);

            stmtHive.close();
            ConnectDB.getHiveConnection().close();
            logger.info("Sushi Tables Created");
        } catch (Exception e) {
            logger.error("Failed to create tables: " + e);
            throw new Exception("Failed to create tables: " + e.toString(), e);
        }
    }

    /**
     * Creates and populates sarc_downloads_stats_tmp on Hive from the raw
     * sushilog rows with source 'SARC-OJS' and metric_type 'ft_total'.
     *
     * @throws Exception on any SQL failure; the Hive connection is left open
     *                   for subsequent steps (see commented-out close below)
     */
    public void processSarc() throws Exception {
        Statement stmt = ConnectDB.getHiveConnection().createStatement();
        ConnectDB.getHiveConnection().setAutoCommit(false);

        logger.info("Creating sarc_downloads_stats_tmp table");
        String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
            + ".sarc_downloads_stats_tmp "
            + "(`source` string, "
            + "`repository_id` string, "
            + "`result_id` string, "
            + "`date` string, "
            + "`count` bigint, "
            + "`openaire` bigint)";
        stmt.executeUpdate(createDownloadsStats);
        logger.info("Created sarc_downloads_stats_tmp table");

        logger.info("Inserting into sarc_downloads_stats_tmp");
        // Repository matched by OID substring restricted to 'sarcservicod'
        // datasources; result matched by DOI via result_pids. The trailing
        // '0' fills the openaire column.
        String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp "
            + "SELECT s.source, d.id AS repository_id, "
            + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
            + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
            + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
            + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
            + ConnectDB.getStatsDBSchema() + ".result_pids ro "
            + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
            + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
        stmt.executeUpdate(insertSarcStats);
        logger.info("Inserted into sarc_downloads_stats_tmp");

        stmt.close();
        // ConnectDB.getHiveConnection().close();
    }

}
|
|
@ -1,137 +0,0 @@
|
|||
|
||||
package eu.dnetlib.oa.graph.usagestatsbuild.export;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Main class for downloading and processing Usage statistics
|
||||
*
|
||||
* @author D. Pierrakos, S. Zoupanos
|
||||
*/
|
||||
public class UsageStatsExporter {
|
||||
|
||||
public UsageStatsExporter() {
|
||||
|
||||
}
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
|
||||
|
||||
public void export() throws Exception {
|
||||
|
||||
logger.info("Initialising DB properties");
|
||||
ConnectDB.init();
|
||||
|
||||
// runImpalaQuery();
|
||||
PiwikStatsDB piwikstatsdb = new PiwikStatsDB();
|
||||
logger.info("Re-creating database and tables");
|
||||
if (ExecuteWorkflow.recreateDbAndTables) {
|
||||
piwikstatsdb.recreateDBAndTables();
|
||||
logger.info("DB-Tables are created ");
|
||||
}
|
||||
// else {
|
||||
// piwikstatsdb.createTmpTables();
|
||||
// logger.info("TmpTables are created ");
|
||||
// }
|
||||
if (ExecuteWorkflow.processPiwikLogs) {
|
||||
logger.info("Creating distinct piwik log");
|
||||
piwikstatsdb.createDistinctPiwikLog();
|
||||
logger.info("Processing OpenAIRE logs");
|
||||
piwikstatsdb.processLogs();
|
||||
logger.info("OpenAIRE logs Done");
|
||||
logger.info("Processing Episciences logs");
|
||||
piwikstatsdb.processEpisciencesLogs();
|
||||
logger.info("Episciences logs Done");
|
||||
logger.info("Processing Pedocs Old Stats");
|
||||
piwikstatsdb.uploadOldPedocs();
|
||||
logger.info("Processing Pedocs Old Stats Done");
|
||||
logger.info("Processing TUDELFT Stats");
|
||||
piwikstatsdb.uploadTUDELFTStats();
|
||||
logger.info("Processing TUDELFT Stats Done");
|
||||
logger.info("Processing B2SHARE Stats");
|
||||
piwikstatsdb.uploadB2SHAREStats();
|
||||
logger.info("Processing B2SHARE Stats Done");
|
||||
|
||||
}
|
||||
|
||||
LaReferenciaStats lastats = new LaReferenciaStats();
|
||||
|
||||
if (ExecuteWorkflow.processLaReferenciaLogs) {
|
||||
logger.info("Processing LaReferencia logs");
|
||||
lastats.processLogs();
|
||||
logger.info("LaReferencia logs done");
|
||||
}
|
||||
|
||||
IrusStats irusstats = new IrusStats();
|
||||
|
||||
if (ExecuteWorkflow.irusProcessStats) {
|
||||
logger.info("Processing IRUS");
|
||||
irusstats.processIrusStats();
|
||||
logger.info("Irus done");
|
||||
}
|
||||
|
||||
SarcStats sarcStats = new SarcStats();
|
||||
|
||||
if (ExecuteWorkflow.sarcProcessStats) {
|
||||
sarcStats.processSarc();
|
||||
}
|
||||
logger.info("Sarc done");
|
||||
|
||||
// finalize usagestats
|
||||
if (ExecuteWorkflow.finalizeStats) {
|
||||
piwikstatsdb.finalizeStats();
|
||||
logger.info("Finalized stats");
|
||||
}
|
||||
|
||||
// Make the tables available to Impala
|
||||
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
|
||||
logger.info("Making tables visible to Impala");
|
||||
invalidateMetadata();
|
||||
}
|
||||
|
||||
logger.info("End");
|
||||
}
|
||||
|
||||
private void invalidateMetadata() throws SQLException {
|
||||
Statement stmt = null;
|
||||
|
||||
stmt = ConnectDB.getImpalaConnection().createStatement();
|
||||
|
||||
String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".counter_r5_stats_with_metrics";
|
||||
stmt.executeUpdate(sql);
|
||||
|
||||
stmt.close();
|
||||
ConnectDB.getHiveConnection().close();
|
||||
}
|
||||
}
|
|
@ -14,7 +14,7 @@ import java.util.Calendar;
|
|||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* @author D. Pierrakos, S. Zoupanos
|
||||
* @author D. Pierrakos
|
||||
*/
|
||||
import com.mchange.v2.c3p0.ComboPooledDataSource;
|
||||
|
|
@ -0,0 +1,194 @@
|
|||
|
||||
package eu.dnetlib.oa.graph.usagestatsbuild.export;
|
||||
|
||||
import java.sql.*;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* @author D. Pierrakos
|
||||
*/
|
||||
public class EpisciencesViewsDownloads {
|
||||
|
||||
private String logPath;
|
||||
|
||||
private Statement stmt = null;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(EpisciencesViewsDownloads.class);
|
||||
|
||||
public void processEpisciencesLogs() throws Exception {
|
||||
try {
|
||||
|
||||
logger.info("Views Episciences processing starts at: " + new Timestamp(System.currentTimeMillis()));
|
||||
episciencesViewsStats();
|
||||
logger.info("Views Episciences processing ends at: " + new Timestamp(System.currentTimeMillis()));
|
||||
|
||||
logger.info("downloads Episciences processing starts at: " + new Timestamp(System.currentTimeMillis()));
|
||||
episciencesDownloadsStats();
|
||||
logger.info("Downloads Episciences processing ends at: " + new Timestamp(System.currentTimeMillis()));
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to process logs: " + e);
|
||||
throw new Exception("Failed to process logs: " + e.toString(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public void episciencesViewsStats() throws Exception {
|
||||
logger.info("Creating episciences Views");
|
||||
|
||||
Statement stmt = ConnectDB.getHiveConnection().createStatement();
|
||||
ConnectDB.getHiveConnection().setAutoCommit(false);
|
||||
|
||||
logger.info("Dropping Episcience Views Table");
|
||||
String dropEpisciencesViewsTable = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".episciencesviews ";
|
||||
stmt.executeUpdate(dropEpisciencesViewsTable);
|
||||
logger.info("Dropped Episcience Views Table");
|
||||
|
||||
logger.info("Creating Episcience Views Table");
|
||||
String createEpisciencesViewsTable = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".episciencesviews (source STRING, repository_id STRING, result_id STRING, date STRING, count INT, openaire INT)"
|
||||
+ " clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true') ";
|
||||
|
||||
stmt.executeUpdate(createEpisciencesViewsTable);
|
||||
|
||||
String returnEpisciencesJournals = "SELECT id, substring(regexp_extract(websiteurl,'^([^\\.]+)\\.?',1),9) FROM "
|
||||
+ ConnectDB.getStatsDBSchema() +
|
||||
".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true) "
|
||||
+ "and substring(regexp_extract(websiteurl,'^([^\\\\.]+)\\\\.?',1),9)!='episciences'";
|
||||
|
||||
PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
|
||||
.prepareStatement(returnEpisciencesJournals);
|
||||
ResultSet rs = st.executeQuery();
|
||||
while (rs.next()) {
|
||||
String journal_openaire_id = rs.getString(1);
|
||||
String episciencesSuffix = rs.getString(2);
|
||||
|
||||
logger.info("Working on journal_id:" + journal_openaire_id + " suffix:" + episciencesSuffix);
|
||||
logger.info("Dropping episciencesSuffix_result_views_monthly_tmp table");
|
||||
String dropepisciencesSuffixView = "DROP VIEW " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ "." + episciencesSuffix.replace("-", "_") + "_result_views_monthly_tmp";
|
||||
// Statement stmtRS = ConnectDB.getHiveConnection().createStatement();
|
||||
stmt.executeUpdate(dropepisciencesSuffixView);
|
||||
logger.info("Dropped episciencesSuffix_result_views_monthly_tmp table");
|
||||
|
||||
logger.info("Creating episciencesSuffix_result_views_monthly_tmp table");
|
||||
|
||||
String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ "." + episciencesSuffix.replace("-", "_") + "_result_views_monthly_tmp "
|
||||
+ "AS SELECT entity_id, "
|
||||
+ "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
|
||||
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) "
|
||||
+ "AS openaire_referrer, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".episcienceslogdistinct where action='action' and (source_item_type='oaItem' or "
|
||||
+ "source_item_type='repItem') and entity_id like '%" + episciencesSuffix + "%'"
|
||||
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
|
||||
+ "source ORDER BY source, entity_id";
|
||||
stmt.executeUpdate(create_result_views_monthly);
|
||||
logger.info("Created episciencesSuffix_result_views_monthly_tmp table");
|
||||
|
||||
logger.info("Inserting episciencesSuffix_result_views_monthly_tmp into EpisciencesViews Table");
|
||||
String insertIntoEpisciencesViewsTable = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".episciencesviews SELECT 'Episciences' as source, '"
|
||||
+ journal_openaire_id + "' as repository_id, ro.id as result_id, month as date,"
|
||||
+ " max(views) AS count, max(openaire_referrer) AS openaire "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ "." + episciencesSuffix.replace("-", "_") + "_result_views_monthly_tmp p,"
|
||||
+ ConnectDB.getStatsDBSchema()
|
||||
+ ".result_oids ro WHERE p.id=ro.oid GROUP BY ro.id, month ORDER BY ro.id, month";
|
||||
|
||||
stmt.executeUpdate(insertIntoEpisciencesViewsTable);
|
||||
logger.info("Inserted episciencesSuffix_result_views_monthly_tmp into EpisciencesViews Table");
|
||||
|
||||
stmt.executeUpdate(dropepisciencesSuffixView);
|
||||
logger.info("Dropped episciencesSuffix_result_views_monthly_tmp view");
|
||||
}
|
||||
rs.close();
|
||||
|
||||
logger.info("Episciences Views Created");
|
||||
}
|
||||
|
||||
public void episciencesDownloadsStats() throws Exception {
|
||||
logger.info("Creating episciences Downloads");
|
||||
|
||||
Statement stmt = ConnectDB.getHiveConnection().createStatement();
|
||||
ConnectDB.getHiveConnection().setAutoCommit(false);
|
||||
|
||||
logger.info("Dropping Episcience Downloads Table");
|
||||
String dropEpisciencesDownloadsTable = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".episciencesvdownloads ";
|
||||
stmt.executeUpdate(dropEpisciencesDownloadsTable);
|
||||
logger.info("Dropped Episcience Downloads Table");
|
||||
|
||||
logger.info("Creating Episcience Downloads Table");
|
||||
String createEpisciencesDownloadsTable = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".episciencesdownloads (source STRING, repository_id STRING, result_id STRING, date STRING, count INT, openaire INT)"
|
||||
+ " clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true') ";
|
||||
|
||||
stmt.executeUpdate(createEpisciencesDownloadsTable);
|
||||
|
||||
String returnEpisciencesJournals = "SELECT id, substring(regexp_extract(websiteurl,'^([^\\.]+)\\.?',1),9) FROM "
|
||||
+ ConnectDB.getStatsDBSchema() +
|
||||
".datasource where websiteurl like '%episciences%' and (dateofvalidation is not null or harvested=true) "
|
||||
+ "and substring(regexp_extract(websiteurl,'^([^\\\\.]+)\\\\.?',1),9)!='episciences'";
|
||||
|
||||
PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
|
||||
.prepareStatement(returnEpisciencesJournals);
|
||||
ResultSet rs = st.executeQuery();
|
||||
while (rs.next()) {
|
||||
String journal_openaire_id = rs.getString(1);
|
||||
String episciencesSuffix = rs.getString(2);
|
||||
|
||||
logger.info("Working on journal_id:" + journal_openaire_id + " suffix:" + episciencesSuffix);
|
||||
logger.info("Dropping episciencesSuffix_result_downloads_monthly_tmp table");
|
||||
String dropepisciencesSuffixDownloads = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ "." + episciencesSuffix.replace("-", "_") + "_result_downloads_monthly_tmp";
|
||||
stmt.executeUpdate(dropepisciencesSuffixDownloads);
|
||||
|
||||
logger.info("Creating episciencesSuffix_result_downloads_monthly_tmp table");
|
||||
|
||||
String create_result_downloads_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ "." + episciencesSuffix.replace("-", "_") + "_result_downloads_monthly_tmp "
|
||||
+ "AS SELECT entity_id, "
|
||||
+ "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
|
||||
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) "
|
||||
+ "AS openaire_referrer, "
|
||||
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".episcienceslogdistinct where action='download' and (source_item_type='oaItem' or "
|
||||
+ "source_item_type='repItem') and entity_id like '%" + episciencesSuffix + "%'"
|
||||
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
|
||||
+ "source ORDER BY source, entity_id";
|
||||
|
||||
stmt.executeUpdate(create_result_downloads_monthly);
|
||||
logger.info("Created episciencesSuffix_result_downloads_monthly_tmp table");
|
||||
|
||||
logger.info("Inserting episciencesSuffix_result_downloads_monthly_tmp into EpisciencesDownloadsTable");
|
||||
String insertIntoEpisciencesDownloadsTable = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ ".episciencesdownloads SELECT 'Episciences' as source, '"
|
||||
+ journal_openaire_id + "' as repository_id, ro.id as result_id, month as date,"
|
||||
+ " max(views) AS count, max(openaire_referrer) AS openaire "
|
||||
+ "FROM " + ConnectDB.getUsageStatsDBSchema()
|
||||
+ "." + episciencesSuffix.replace("-", "_") + "_result_downloads_monthly_tmp p,"
|
||||
+ ConnectDB.getStatsDBSchema()
|
||||
+ ".result_oids ro WHERE p.id=ro.oid GROUP BY ro.id, month ORDER BY ro.id, month";
|
||||
|
||||
stmt.executeUpdate(insertIntoEpisciencesDownloadsTable);
|
||||
logger.info("Inserted episciencesSuffix_result_downloads_monthly_tmp into EpisciencesDownloadsTable");
|
||||
|
||||
stmt.executeUpdate(dropepisciencesSuffixDownloads);
|
||||
logger.info("Dropped episciencesSuffix_result_downloads_monthly_tmp view");
|
||||
|
||||
}
|
||||
rs.close();
|
||||
}
|
||||
|
||||
private Connection getConnection() throws SQLException {
|
||||
return ConnectDB.getHiveConnection();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* To change this license header, choose License Headers in Project Properties.
|
||||
* To change this template file, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package eu.dnetlib.oa.graph.usagestatsbuild.export;
|
||||
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.log4j.BasicConfigurator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
/**
|
||||
* @author D. Pierrakos
|
||||
*/
|
||||
public class ExecuteWorkflow {
|
||||
|
||||
static String dbHiveUrl;
|
||||
static String dbImpalaUrl;
|
||||
static String usageRawDataDBSchema;
|
||||
static String usageStatsDBSchema;
|
||||
static String usagestatsPermanentDBSchema;
|
||||
static String statsDBSchema;
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
|
||||
// Sending the logs to the console
|
||||
BasicConfigurator.configure();
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
UsageStatsExporter.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/usagestatsupdate/export/usagestatupdate_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
dbHiveUrl = parser.get("dbHiveUrl");
|
||||
dbImpalaUrl = parser.get("dbImpalaUrl");
|
||||
usageRawDataDBSchema = parser.get("usageRawDataDB");
|
||||
usageStatsDBSchema = parser.get("usageStatsDB");
|
||||
usagestatsPermanentDBSchema = parser.get("usagestatsPermanentDBSchema");
|
||||
statsDBSchema = parser.get("statsDB");
|
||||
|
||||
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
|
||||
usagestatsExport.export();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
|
||||
package eu.dnetlib.oa.graph.usagestatsbuild.export;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Main class for downloading and processing Usage statistics
|
||||
*
|
||||
* @author D. Pierrakos
|
||||
*/
|
||||
public class UsageStatsExporter {
|
||||
|
||||
public UsageStatsExporter() {
|
||||
|
||||
}
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
|
||||
|
||||
public void export() throws Exception {
|
||||
|
||||
logger.info("Initialising DB properties");
|
||||
ConnectDB.init();
|
||||
|
||||
EpisciencesViewsDownloads episciencesViewsDownloads = new EpisciencesViewsDownloads();
|
||||
logger.info("Processing Episciences logs");
|
||||
episciencesViewsDownloads.processEpisciencesLogs();
|
||||
logger.info("Episciences logs Done");
|
||||
}
|
||||
}
|
|
@ -1,92 +0,0 @@
|
|||
[
|
||||
{
|
||||
"paramName": "pmi",
|
||||
"paramLongName": "portalMatomoID",
|
||||
"paramDescription": "namoNode of the target cluster",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dbhu",
|
||||
"paramLongName": "dbHiveUrl",
|
||||
"paramDescription": "activate tranform-only mode. Only apply transformation step",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dbiu",
|
||||
"paramLongName": "dbImpalaUrl",
|
||||
"paramDescription": "activate tranform-only mode. Only apply transformation step",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "urdbs",
|
||||
"paramLongName": "usageRawDataDBSchema",
|
||||
"paramDescription": "activate tranform-only mode. Only apply transformation step",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "usdbs",
|
||||
"paramLongName": "usageStatsDBSchema",
|
||||
"paramDescription": "activate tranform-only mode. Only apply transformation step",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "sdbs",
|
||||
"paramLongName": "statsDBSchema",
|
||||
"paramDescription": "activate tranform-only mode. Only apply transformation step",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "uspdbs",
|
||||
"paramLongName": "usagestatsPermanentDBSchema",
|
||||
"paramDescription": "activate tranform-only mode. Only apply transformation step",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "rdbt",
|
||||
"paramLongName": "recreateDbAndTables",
|
||||
"paramDescription": "Re-create database and initial tables?",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ppwl",
|
||||
"paramLongName": "processPiwikLogs",
|
||||
"paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "plrl",
|
||||
"paramLongName": "processLaReferenciaLogs",
|
||||
"paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ipr",
|
||||
"paramLongName": "irusProcessStats",
|
||||
"paramDescription": "Irus section: Process stats?",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ipr",
|
||||
"paramLongName": "sarcProcessStats",
|
||||
"paramDescription": "Sarc section: Process stats?",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "fs",
|
||||
"paramLongName": "finalizeStats",
|
||||
"paramDescription": "Create the usage_stats table?",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ftvi",
|
||||
"paramLongName": "finalTablesVisibleToImpala",
|
||||
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "nodt",
|
||||
"paramLongName": "numberOfDownloadThreads",
|
||||
"paramDescription": "Number of download threads",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -1,83 +0,0 @@
|
|||
<workflow-app name="Usage Stats Update" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<description>Hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<description>Hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>impalaJdbcUrl</name>
|
||||
<description>Impala server jdbc url</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>hive.metastore.uris</name>
|
||||
<value>${hiveMetastoreUris}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark.executor.memory</name>
|
||||
<value>19166291558</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark.yarn.executor.memoryOverhead</name>
|
||||
<value>3225</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark.driver.memory</name>
|
||||
<value>11596411699</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark.yarn.driver.memoryOverhead</name>
|
||||
<value>1228</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="Step1"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name='Step1'>
|
||||
<java>
|
||||
<main-class>eu.dnetlib.oa.graph.usagestatsbuild.export.ExecuteWorkflow</main-class>
|
||||
<arg>--portalMatomoID</arg><arg>${portalMatomoID}</arg>
|
||||
<arg>--dbHiveUrl</arg><arg>${hiveJdbcUrl}</arg>
|
||||
<arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
|
||||
<arg>--usageRawDataDBSchema</arg><arg>${usageRawDataDBSchema}</arg>
|
||||
<arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
|
||||
<arg>--usagestatsPermanentDBSchema</arg><arg>${usagestatsPermanentDBSchema}</arg>
|
||||
<arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
|
||||
<arg>--recreateDbAndTables</arg><arg>${recreateDbAndTables}</arg>
|
||||
<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
|
||||
<arg>--processLaReferenciaLogs</arg><arg>${processLaReferenciaLogs}</arg>
|
||||
<arg>--irusProcessStats</arg><arg>${irusProcessStats}</arg>
|
||||
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
|
||||
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
|
||||
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
|
||||
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="End" />
|
||||
<error to="Kill" />
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,44 @@
|
|||
[
|
||||
{
|
||||
"paramName": "pmi",
|
||||
"paramLongName": "portalMatomoID",
|
||||
"paramDescription": "OpenAIRE Explore Matomo",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dbhu",
|
||||
"paramLongName": "dbHiveUrl",
|
||||
"paramDescription": "HIVE URL",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dbiu",
|
||||
"paramLongName": "dbImpalaUrl",
|
||||
"paramDescription": "Impala URL",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "urdbs",
|
||||
"paramLongName": "usageRawDataDB",
|
||||
"paramDescription": "Usage Raw DB",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "usdbs",
|
||||
"paramLongName": "usageStatsDB",
|
||||
"paramDescription": "Usage Stats DB",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "sdbs",
|
||||
"paramLongName": "statsDB",
|
||||
"paramDescription": "Stats DB",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "uspdbs",
|
||||
"paramLongName": "usagestatsPermanentDB",
|
||||
"paramDescription": "Shadow Usage Stats DB",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,21 @@
|
|||
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
# FIX: quote the path — an unquoted $link_folder would be word-split if it ever
# contained spaces, silently breaking the -L test.
if ! [ -L "$link_folder" ]
then
    rm -Rf "$link_folder"
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi

# Positional parameters: source DB, target usage-stats DB, shadow DB.
export SOURCE=$1
export TARGET=$2
export SHADOW=$3

# Refresh Impala's view of the metastore, then compute stats on every table of the target DB.
impala-shell -q "invalidate metadata;"
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f -
echo "Impala shell finished"

# Recreate the shadow DB as a set of views over the target DB's tables:
# drop all existing views in the shadow DB, then create one view per target table.
# NOTE(review): the message below says "observatory" although this is the usage-stats
# workflow — looks like a copy-paste from another script; confirm before changing.
echo "Updating shadow observatory database"
impala-shell -q "create database if not exists ${SHADOW}"
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
echo "Shadow db ready!"
|
|
@ -0,0 +1,8 @@
|
|||
--------------------------------------------------------------
--------------------------------------------------------------
-- Usage Stats database creation
--------------------------------------------------------------
--------------------------------------------------------------

-- Recreates the usage-stats database from scratch; CASCADE also removes all
-- tables and views it contains.
DROP database IF EXISTS ${usageStatsDB} CASCADE;
CREATE database ${usageStatsDB};
|
|
@ -0,0 +1,10 @@
|
|||
-- LaReferencia Distinct
-- De-duplicates the raw LaReferencia log into a bucketed, transactional ORC table.
DROP TABLE IF EXISTS ${usageStatsDB}.lareferencialogdistinct;

CREATE TABLE IF NOT EXISTS ${usageStatsDB}.lareferencialogdistinct(matomoid INT, source INT, id_visit STRING, country STRING, action STRING, url STRING,
entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING)
clustered by (source, id_visit, action, timestamp, entity_id)
into 100 buckets stored as orc tblproperties('transactional'='true');

-- DISTINCT drops exact duplicate log rows; rows without an entity_id carry no
-- usable item reference and are filtered out.
INSERT INTO ${usageStatsDB}.lareferencialogdistinct
SELECT DISTINCT * FROM ${usageRawDataDB}.lareferencialog WHERE entity_id is not null;
|
|
@ -0,0 +1,18 @@
|
|||
--LaReferencia views
-- Monthly per-item view counts ('action' events) from the de-duplicated LaReferencia log.

CREATE OR REPLACE VIEW ${usageStatsDB}.la_result_views_monthly_tmp AS
SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%'
THEN 1 ELSE 0 END) AS openaire_referrer,
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
FROM ${usageStatsDB}.lareferencialogdistinct where action='action' and
(source_item_type='oaItem' or source_item_type='repItem')
GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;

DROP TABLE IF EXISTS ${usageStatsDB}.la_views_stats_tmp;

-- Map log entity/source ids to OpenAIRE result/datasource ids and keep the monthly maximum.
-- FIX: was ${statasDB} (undefined variable) on the datasource_oids reference — aligned
-- with ${statsDB} as used by the downloads script.
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.la_views_stats_tmp
AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date,
max(views) AS count, max(openaire_referrer) AS openaire
FROM ${usageStatsDB}.la_result_views_monthly_tmp p, ${statsDB}.datasource_oids d,
${statsDB}.result_oids ro WHERE p.source=d.oid AND p.id=ro.oid GROUP BY d.id, ro.id, month;
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
--LaReferencia downloads
-- Monthly per-item download counts ('download' events) from the de-duplicated LaReferencia log.

CREATE OR REPLACE VIEW ${usageStatsDB}.la_result_downloads_monthly_tmp AS
SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%'
THEN 1 ELSE 0 END) AS openaire_referrer,
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
FROM ${usageStatsDB}.lareferencialogdistinct where action='download' and
(source_item_type='oaItem' or source_item_type='repItem')
GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;

DROP TABLE IF EXISTS ${usageStatsDB}.la_downloads_stats_tmp;

-- Map log entity/source ids to OpenAIRE result/datasource ids and keep the monthly maximum.
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.la_downloads_stats_tmp
AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date,
max(downloads) AS count, max(openaire_referrer) AS openaire
FROM ${usageStatsDB}.la_result_downloads_monthly_tmp p, ${statsDB}.datasource_oids d,
${statsDB}.result_oids ro WHERE p.source=d.oid AND p.id=ro.oid GROUP BY d.id, ro.id, month;
|
|
@ -0,0 +1,112 @@
|
|||
--LaReferencia CoP R5
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.lr_view_unique_item_investigations
|
||||
AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_investigations,
|
||||
SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.lareferencialogdistinct WHERE (source_item_type='oaItem' or source_item_type='repItem')
|
||||
AND entity_id is NOT NULL GROUP BY id_visit, entity_id,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.lr_tbl_unique_item_investigations;
|
||||
|
||||
CREATE TABLE ${usageStatsDB}.lr_tbl_unique_item_investigations as
|
||||
SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
sum(unique_item_investigations) AS unique_item_investigations, sum(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.lr_view_unique_item_investigations p, ${statsDB}.datasource d,
|
||||
${statsDB}.result_oids ro
|
||||
WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
|
||||
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
|
||||
GROUP BY d.id, ro.id, month;
|
||||
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.lr_view_total_item_investigations
|
||||
AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
COUNT(entity_id) AS total_item_investigations,
|
||||
SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.lareferencialogdistinct
|
||||
WHERE (source_item_type='oaItem' or source_item_type='repItem')
|
||||
AND entity_id is NOT NULL GROUP BY id_visit, entity_id,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.lr_tbl_total_item_investigations;
|
||||
|
||||
CREATE TABLE ${usageStatsDB}.lr_tbl_total_item_investigations as
|
||||
SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
sum(total_item_investigations) AS total_item_investigations, sum(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.lr_view_total_item_investigations p, ${statsDB}.datasource d,
|
||||
${statsDB}.result_oids ro
|
||||
WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
|
||||
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
|
||||
GROUP BY d.id, ro.id, month;
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.lr_view_unique_item_requests AS
|
||||
SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_requests,
|
||||
SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.lareferencialogdistinct
|
||||
WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem')
|
||||
AND entity_id is NOT NULL GROUP BY id_visit, entity_id,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.lr_tbl_unique_item_requests;
|
||||
|
||||
CREATE TABLE ${usageStatsDB}.lr_tbl_unique_item_requests as
|
||||
SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
sum(unique_item_requests) AS unique_item_requests, sum(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.lr_view_unique_item_requests p, ${statsDB}.datasource d,${statsDB}.result_oids ro
|
||||
WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
|
||||
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
|
||||
GROUP BY d.id, ro.id, month;
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.lr_view_total_item_requests
|
||||
AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
COUNT(entity_id) AS total_item_requests,
|
||||
SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.lareferencialogdistinct
|
||||
WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem')
|
||||
AND entity_id is NOT NULL
|
||||
GROUP BY id_visit, entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.lr_tbl_total_item_requests;
|
||||
|
||||
-- LaReferencia COUNTER R5: total item requests per repository/result/month.
CREATE TABLE ${usageStatsDB}.lr_tbl_total_item_requests as
SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date,
sum(total_item_requests) AS total_item_requests, sum(openaire_referrer) AS openaire
-- Fix: read the LaReferencia view (lr_view_total_item_requests, created above).
-- The original referenced view_total_item_requests, which is the OpenAIRE/piwik
-- view from a different script, so this table would mix sources (or fail).
FROM ${usageStatsDB}.lr_view_total_item_requests p, ${statsDB}.datasource d,
${statsDB}.result_oids ro
WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
GROUP BY d.id, ro.id, month;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.lr_tbl_all_r5_metrics;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.lr_tbl_all_r5_metrics as
|
||||
WITH tmp1 as (SELECT coalesce(ds.repository_id, vs.repository_id) as repository_id,
|
||||
coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date,
|
||||
coalesce(vs.unique_item_investigations, 0) as unique_item_investigations, coalesce(ds.total_item_investigations, 0) as total_item_investigations
|
||||
FROM ${usageStatsDB}.lr_tbl_unique_item_investigations AS vs
|
||||
FULL OUTER JOIN
|
||||
${usageStatsDB}.lr_tbl_total_item_investigations AS ds
|
||||
ON ds.source=vs.source AND ds.result_id=vs.result_id AND ds.date=vs.date),
|
||||
tmp2 AS (select coalesce (ds.repository_id, vs.repository_id) as repository_id,
|
||||
coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date,
|
||||
coalesce(ds.total_item_investigations, 0) as total_item_investigations,
|
||||
coalesce(ds.unique_item_investigations, 0) as unique_item_investigations,
|
||||
coalesce(vs.unique_item_requests, 0) as unique_item_requests FROM tmp1 AS ds
|
||||
FULL OUTER JOIN
|
||||
${usageStatsDB}.lr_tbl_unique_item_requests AS vs
|
||||
ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date)
|
||||
SELECT 'LaReferencia' as source, coalesce (ds.repository_id, vs.repository_id) as repository_id,
|
||||
coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date,
|
||||
coalesce(ds.unique_item_investigations, 0) as unique_item_investigations,
|
||||
coalesce(ds.total_item_investigations, 0) as total_item_investigations,
|
||||
coalesce(ds.unique_item_requests, 0) as unique_item_requests,
|
||||
coalesce(vs.total_item_requests, 0) as total_item_requests FROM tmp2 AS ds
|
||||
FULL OUTER JOIN
|
||||
${usageStatsDB}.lr_tbl_total_item_requests AS vs ON ds.repository_id=vs.repository_id
|
||||
AND ds.result_id=vs.result_id AND ds.date=vs.date;
|
|
@ -0,0 +1,31 @@
|
|||
--IRUS Stats
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.irus_downloads_stats_tmp
|
||||
(`source` string,
|
||||
`repository_id` string,
|
||||
`result_id` string,
|
||||
`date` string,
|
||||
`count` bigint,
|
||||
`openaire` bigint);
|
||||
|
||||
INSERT INTO ${usageStatsDB}.irus_downloads_stats_tmp
|
||||
SELECT s.source, d.id AS repository_id,
|
||||
ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0'
|
||||
FROM ${usageRawDataDB}.sushilog s, ${statsDB}.datasource_oids d, ${statsDB}.result_oids ro
|
||||
WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK';
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.irus_R5_stats_tmp
|
||||
(`source` string,
|
||||
`repository_id` string,
|
||||
`result_id` string,
|
||||
`date` string,
|
||||
`views` bigint,
|
||||
`downloads` bigint,
|
||||
`openaire` bigint);
|
||||
|
||||
INSERT INTO ${usageStatsDB}.irus_R5_stats_tmp
|
||||
SELECT s.source, d.id AS repository_id,
|
||||
ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date,
|
||||
(s.total_item_investigations-s.total_item_requests) as views, s.total_item_requests as downloads, '0'
|
||||
FROM ${usageRawDataDB}.sushilog_cop_r5 s, ${statsDB}.datasource_oids d, ${statsDB}.result_oids ro
|
||||
WHERE s.repository=d.oid AND s.rid=ro.oid AND s.source='IRUS-UK';
|
|
@ -0,0 +1,18 @@
|
|||
--SARC Downloads
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.sarc_downloads_stats_tmp
|
||||
(`source` string,
|
||||
`repository_id` string,
|
||||
`result_id` string,
|
||||
`date` string,
|
||||
`count` bigint,
|
||||
`openaire` bigint);
|
||||
|
||||
INSERT INTO ${usageStatsDB}.sarc_downloads_stats_tmp
|
||||
SELECT s.source, d.id AS repository_id,
|
||||
ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/',
|
||||
LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0'
|
||||
FROM ${usageRawDataDB}.sushilog s, ${statsDB}.datasource_oids d, ${statsDB}.result_pids ro
|
||||
WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%')
|
||||
AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS';
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
--Finalize USAGE Stats
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.views_stats;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.downloads_stats;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.pageviews_stats;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.usage_stats;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.project_stats;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.download_stats;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.views_stats
|
||||
LIKE ${usageStatsDB}.openaire_views_stats_tmp STORED AS PARQUET;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.views_stats
|
||||
SELECT * FROM ${usageStatsDB}.openaire_views_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.views_stats
|
||||
SELECT * FROM ${usageStatsDB}.episciencesviews;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.views_stats
|
||||
SELECT * FROM ${usageStatsDB}.pedocs_views_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.views_stats
|
||||
SELECT * FROM ${usageStatsDB}.tudelft_views_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.views_stats
|
||||
SELECT * FROM ${usageStatsDB}.la_views_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.views_stats
|
||||
SELECT * FROM ${usageStatsDB}.b2share_views_stats_tmp;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.downloads_stats
|
||||
LIKE ${usageStatsDB}.openaire_downloads_stats_tmp STORED AS PARQUET;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.openaire_downloads_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.episciencesdownloads;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.pedocs_downloads_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.tudelft_downloads_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.b2share_downloads_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.la_downloads_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.irus_downloads_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.views_stats
|
||||
SELECT source, repository_id, result_id, `date`, views, openaire FROM ${usageStatsDB}.irus_R5_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT source, repository_id, result_id, `date`, downloads, openaire FROM ${usageStatsDB}.irus_R5_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.sarc_downloads_stats_tmp;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.views_stats
|
||||
SELECT * FROM ${usageStatsDB}.datacite_views;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.downloads_stats
|
||||
SELECT * FROM ${usageStatsDB}.datacite_downloads;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.pageviews_stats
|
||||
LIKE ${usageStatsDB}.openaire_pageviews_stats_tmp STORED AS PARQUET;
|
||||
|
||||
INSERT INTO ${usageStatsDB}.pageviews_stats
|
||||
SELECT * FROM ${usageStatsDB}.openaire_pageviews_stats_tmp;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.usage_stats AS
|
||||
SELECT coalesce(ds.source, vs.source) as source,
|
||||
coalesce(ds.repository_id, vs.repository_id) as repository_id,
|
||||
coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date,
|
||||
coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views,
|
||||
coalesce(ds.openaire, 0) as openaire_downloads,
|
||||
coalesce(vs.openaire, 0) as openaire_views
|
||||
FROM ${usageStatsDB}.downloads_stats AS ds
|
||||
FULL OUTER JOIN ${usageStatsDB}.views_stats AS vs ON ds.source=vs.source
|
||||
AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date;
|
||||
|
||||
-- Per-project usage: monthly views/downloads rolled up through project_results.
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.project_stats AS
with project_views as (select id, sum(views) views, sum(openaire_views) openaire_views,`date`
from ${usageStatsDB}.usage_stats
join ${statsDB}.project_results on result_id=result group by id,`date`),
project_downloads as
(select id, sum(downloads) downloads,sum(openaire_downloads) openaire_downloads,`date`
from ${usageStatsDB}.usage_stats
join ${statsDB}.project_results on result_id=result group by id,`date`)
SELECT coalesce(pv.id, pd.id) as id, coalesce(pd.`date`, pv.`date`) as `date`,
coalesce(pv.views, 0) as views, coalesce(pd.downloads, 0) as downloads,
-- Fix: coalesce(pv.openaire_views) had no default and so returned NULL when the
-- views side is absent; default to 0 like every other metric in this statement.
coalesce(pv.openaire_views, 0) as openaire_views,coalesce(pd.openaire_downloads, 0) as openaire_downloads
-- NOTE(review): the WHERE pd.`date`=pv.`date` filter turns this FULL OUTER JOIN
-- into an inner join (rows with activity on only one side are dropped) -- confirm
-- whether the date should instead be part of the ON condition.
FROM project_downloads pd FULL OUTER JOIN project_views pv ON pd.id=pv.id WHERE pd.`date`=pv.`date`;
|
||||
|
||||
-- Per-datasource usage: monthly views/downloads rolled up by repository.
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.datasource_stats AS
with datasource_views as
(select repository_id, sum(views) views, sum(openaire_views) openaire_views,`date`
from ${usageStatsDB}.usage_stats group by repository_id,`date`),
datasource_downloads as
(select repository_id, sum(downloads) downloads,sum(openaire_downloads) openaire_downloads,`date`
from ${usageStatsDB}.usage_stats group by repository_id,`date`)
-- NOTE(review): 'repositor_id' looks like a typo for 'repository_id'; kept as-is
-- because downstream views select * from this table -- rename in a coordinated change.
SELECT coalesce(dv.repository_id, dd.repository_id) as repositor_id,
coalesce(dd.`date`, dv.`date`) as `date`, coalesce(dv.views, 0) as views,
coalesce(dd.downloads, 0) as downloads,
-- Fix: coalesce(dv.openaire_views) had no default; use 0 like the other metrics.
coalesce(dv.openaire_views, 0) as openaire_views,coalesce(dd.openaire_downloads, 0) as openaire_downloads
-- NOTE(review): as in project_stats, the WHERE equality on `date` defeats the
-- FULL OUTER JOIN -- confirm whether it belongs in the ON condition.
FROM datasource_downloads dd FULL OUTER JOIN
datasource_views dv ON dd.repository_id=dv.repository_id WHERE dd.`date`=dv.`date`;
|
||||
|
||||
|
||||
INSERT INTO ${usageStatsDB}.tbl_all_r5_metrics
|
||||
SELECT * FROM ${usageStatsDB}.lr_tbl_all_r5_metrics;
|
||||
|
||||
-- Append IRUS-UK COUNTER R5 metrics to the combined R5 table.
INSERT INTO ${usageStatsDB}.tbl_all_r5_metrics
SELECT s.source, d.id AS repository_id,
ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date,
s.unique_item_investigations , s.total_item_investigations,
s.unique_item_requests, s.total_item_requests
-- Fix: sushilog_cop_r5 is a raw-data table and is read from ${usageRawDataDB}
-- everywhere else in this workflow (see the IRUS R5 step); ${usageStatsDB}
-- does not contain it, so this INSERT would fail.
FROM ${usageRawDataDB}.sushilog_cop_r5 s, ${statsDB}.datasource_oids d, ${statsDB}.result_oids ro
WHERE s.repository=d.oid AND s.rid=ro.oid AND s.source='IRUS-UK';
|
||||
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
-- Shadow DB
|
||||
DROP database IF EXISTS ${usagestatsPermanentDBSchema} CASCADE;
|
||||
CREATE database ${usagestatsPermanentDBSchema};
|
||||
|
||||
DROP VIEW IF EXISTS ${usagestatsPermanentDBSchema}.views_stats;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${usagestatsPermanentDBSchema}.views_stats
|
||||
AS SELECT * FROM ${usageStatsDB}.views_stats;
|
||||
|
||||
DROP VIEW IF EXISTS ${usagestatsPermanentDBSchema}.pageviews_stats;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${usagestatsPermanentDBSchema}.pageviews_stats
|
||||
AS SELECT * FROM ${usageStatsDB}.pageviews_stats;
|
||||
|
||||
DROP VIEW IF EXISTS ${usagestatsPermanentDBSchema}.downloads_stats;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${usagestatsPermanentDBSchema}.downloads_stats
|
||||
AS SELECT * FROM ${usageStatsDB}.downloads_stats;
|
||||
|
||||
DROP VIEW IF EXISTS ${usagestatsPermanentDBSchema}.usage_stats;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${usagestatsPermanentDBSchema}.usage_stats
|
||||
AS SELECT * FROM ${usageStatsDB}.usage_stats;
|
||||
|
||||
DROP VIEW IF EXISTS ${usagestatsPermanentDBSchema}.project_stats;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${usagestatsPermanentDBSchema}.project_stats
|
||||
AS SELECT * FROM ${usageStatsDB}.project_stats;
|
||||
|
||||
DROP VIEW IF EXISTS ${usagestatsPermanentDBSchema}.datasource_stats;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${usagestatsPermanentDBSchema}.datasource_stats
|
||||
AS SELECT * FROM ${usageStatsDB}.datasource_stats;
|
||||
|
||||
DROP VIEW IF EXISTS ${usagestatsPermanentDBSchema}.counter_r5_stats_with_metrics;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${usagestatsPermanentDBSchema}.counter_r5_stats_with_metrics
|
||||
AS SELECT * FROM ${usageStatsDB}.tbl_all_r5_metrics;
|
||||
|
|
@ -0,0 +1,18 @@
|
|||
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
|
||||
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
|
||||
if ! [ -L $link_folder ]
|
||||
then
|
||||
rm -Rf "$link_folder"
|
||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
SHADOW=$1
|
||||
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "INVALIDATE METADATA ${SHADOW}.views_stats;"
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "INVALIDATE METADATA ${SHADOW}.pageviews_stats;"
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "INVALIDATE METADATA ${SHADOW}.downloads_stats;"
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "INVALIDATE METADATA ${SHADOW}.usage_stats;"
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "INVALIDATE METADATA ${SHADOW}.counter_r5_stats_with_metrics;"
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "INVALIDATE METADATA ${SHADOW}.project_stats;"
|
||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "INVALIDATE METADATA ${SHADOW}.datasource_stats;"
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.piwiklogdistinct(source INT, id_visit STRING, country STRING,
|
||||
action STRING, url STRING, entity_id STRING, source_item_type STRING, timestamp STRING,
|
||||
referrer_name STRING, agent STRING) clustered by (source, id_visit, action, timestamp, entity_id)
|
||||
into 100 buckets stored as orc tblproperties('transactional'='true');
|
||||
|
||||
INSERT INTO ${usageStatsDB}.piwiklogdistinct
|
||||
SELECT DISTINCT * FROM ${usageRawDataDB}.piwiklog WHERE entity_id is not null;
|
|
@ -0,0 +1,28 @@
|
|||
--OpenAIRE Views
|
||||
|
||||
-- Fix: drop the view this script actually (re)creates below; the original
-- referenced a misspelled name (openaire_piwikresult_views_monthly_tmp) that is
-- never created anywhere, making the DROP a no-op while the real view lingered.
DROP VIEW IF EXISTS ${usageStatsDB}.openaire_result_views_monthly_tmp;
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.openaire_result_views_monthly_tmp
|
||||
AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END)
|
||||
AS openaire_referrer, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.piwiklogdistinct where action='action'
|
||||
and (source_item_type='oaItem' or source_item_type='repItem')
|
||||
GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.openaire_views_stats_tmp;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.openaire_views_stats_tmp
|
||||
AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
max(views) AS count, max(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.openaire_result_views_monthly_tmp p, ${statsDB}.datasource d,
|
||||
${statsDB}.result_oids ro WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200'
|
||||
AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503'
|
||||
AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' GROUP BY d.id, ro.id, month;
|
||||
|
||||
-- OpenAIRE portal pageviews: restricted to the portal Matomo instance (${portalMatomoID}).
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.openaire_pageviews_stats_tmp AS SELECT
'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count
FROM ${usageStatsDB}.openaire_result_views_monthly_tmp p, ${statsDB}.datasource d, ${statsDB}.result_oids ro
-- Fix: join on ro.oid, not ro.id -- p.id carries the decoded entity identifier
-- (an oid), and every other join against result_oids in this workflow (including
-- the views_stats statement above) matches p.id=ro.oid; ro.id would never match.
WHERE p.source=${portalMatomoID} AND p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200'
AND ro.oid!='204' AND ro.oid!='404' AND ro.oid!='400' AND ro.oid!='503'
AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' GROUP BY d.id, ro.id, month;
|
|
@ -0,0 +1,24 @@
|
|||
--OpenAIRE Downloads
|
||||
|
||||
DROP VIEW IF EXISTS ${usageStatsDB}.openaire_result_downloads_monthly_tmp;
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.openaire_result_downloads_monthly_tmp
|
||||
AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END)
|
||||
AS openaire_referrer, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.piwiklogdistinct where action='download'
|
||||
AND (source_item_type='oaItem' OR source_item_type='repItem')
|
||||
GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.openaire_downloads_stats_tmp;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.openaire_downloads_stats_tmp AS
|
||||
SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
max(downloads) AS count, max(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.openaire_result_downloads_monthly_tmp p,
|
||||
${statsDB}.datasource d, ${statsDB}.result_oids ro
|
||||
WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
|
||||
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
|
||||
GROUP BY d.id, ro.id, month;
|
||||
|
||||
DROP VIEW IF EXISTS ${usageStatsDB}.openaire_result_downloads_monthly_tmp;
|
|
@ -0,0 +1,108 @@
|
|||
--CoP R5
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.view_unique_item_investigations
|
||||
AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_investigations,
|
||||
SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.piwiklogdistinct
|
||||
WHERE (source_item_type='oaItem' or source_item_type='repItem')
|
||||
AND entity_id is NOT NULL GROUP BY id_visit, entity_id,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
-- Fix: stray '"' (residue of a Java string literal) after the table name made
-- this statement a syntax error; terminate with ';' only.
DROP TABLE IF EXISTS ${usageStatsDB}.tbl_unique_item_investigations;
|
||||
|
||||
CREATE TABLE ${usageStatsDB}.tbl_unique_item_investigations as
|
||||
SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
sum(unique_item_investigations) AS unique_item_investigations, sum(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.view_unique_item_investigations p, ${statsDB}.datasource d, ${statsDB}.result_oids ro
|
||||
WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
|
||||
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
|
||||
GROUP BY d.id, ro.id, month;
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.view_total_item_investigations
|
||||
AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
COUNT(entity_id) AS total_item_investigations,
|
||||
SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.piwiklogdistinct WHERE (source_item_type='oaItem' or source_item_type='repItem')
|
||||
AND entity_id is NOT NULL GROUP BY id_visit, entity_id,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.tbl_total_item_investigations;
|
||||
|
||||
CREATE TABLE ${usageStatsDB}.tbl_total_item_investigations AS
|
||||
SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
sum(total_item_investigations) AS total_item_investigations, sum(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.view_total_item_investigations p, ${statsDB}.datasource d,${statsDB}.result_oids ro
|
||||
WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
|
||||
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
|
||||
GROUP BY d.id, ro.id, month;
|
||||
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.view_unique_item_requests AS
|
||||
SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
CASE WHEN COUNT(entity_id)>1 THEN 1 ELSE 1 END AS unique_item_requests,
|
||||
SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.piwiklogdistinct
|
||||
WHERE action='download' AND (source_item_type='oaItem' or source_item_type='repItem')
|
||||
AND entity_id is NOT NULL GROUP BY id_visit, entity_id,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
-- Fix: stray '"' (residue of a Java string literal) after the table name made
-- this statement a syntax error; terminate with ';' only.
DROP TABLE IF EXISTS ${usageStatsDB}.tbl_unique_item_requests;
|
||||
|
||||
CREATE TABLE ${usageStatsDB}.tbl_unique_item_requests as
|
||||
SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
sum(unique_item_requests) AS unique_item_requests, sum(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.view_unique_item_requests p, ${statsDB}.datasource d,${statsDB}.result_oids ro
|
||||
WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
|
||||
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
|
||||
GROUP BY d.id, ro.id, month;
|
||||
|
||||
CREATE OR REPLACE VIEW ${usageStatsDB}.view_total_item_requests
|
||||
AS SELECT id_visit, entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
|
||||
COUNT(entity_id) AS total_item_requests,
|
||||
SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source
|
||||
FROM ${usageStatsDB}.piwiklogdistinct WHERE action='download'
|
||||
AND (source_item_type='oaItem' or source_item_type='repItem')
|
||||
AND entity_id is NOT NULL GROUP BY id_visit, entity_id,
|
||||
CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;
|
||||
|
||||
-- Fix: '$usageStatsDB' was missing braces, so the variable would not be
-- substituted and the DROP would target a literal '$usageStatsDB' schema.
DROP TABLE IF EXISTS ${usageStatsDB}.tbl_total_item_requests;
|
||||
|
||||
CREATE TABLE ${usageStatsDB}.tbl_total_item_requests as
|
||||
SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date,
|
||||
sum(total_item_requests) AS total_item_requests, sum(openaire_referrer) AS openaire
|
||||
FROM ${usageStatsDB}.view_total_item_requests p, ${statsDB}.datasource d, ${statsDB}.result_oids ro
|
||||
WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND ro.oid!='204' AND ro.oid!='404'
|
||||
AND ro.oid!='400' AND ro.oid!='503' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6'
|
||||
GROUP BY d.id, ro.id, month;
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.tbl_all_r5_metrics;
|
||||
|
||||
-- Fix: the combined R5 table belongs to the usage-stats DB -- it is dropped just
-- above as ${usageStatsDB}.tbl_all_r5_metrics and later filled/read under that
-- name in the finalize step -- not to ${statsDB}.
CREATE TABLE IF NOT EXISTS ${usageStatsDB}.tbl_all_r5_metrics as
|
||||
WITH tmp1 as (SELECT coalesce(ds.repository_id, vs.repository_id) as repository_id,
|
||||
coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date,
|
||||
coalesce(vs.unique_item_investigations, 0) as unique_item_investigations,
|
||||
coalesce(ds.total_item_investigations, 0) as total_item_investigations
|
||||
FROM ${usageStatsDB}.tbl_unique_item_investigations AS vs
|
||||
FULL OUTER JOIN
|
||||
${usageStatsDB}.tbl_total_item_investigations AS ds
|
||||
ON ds.source=vs.source AND ds.result_id=vs.result_id AND ds.date=vs.date),
|
||||
tmp2 AS (select coalesce (ds.repository_id, vs.repository_id) as repository_id,
|
||||
coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date,
|
||||
coalesce(ds.total_item_investigations, 0) as total_item_investigations,
|
||||
coalesce(ds.unique_item_investigations, 0) as unique_item_investigations,
|
||||
coalesce(vs.unique_item_requests, 0) as unique_item_requests FROM tmp1
|
||||
AS ds FULL OUTER JOIN ${usageStatsDB}.tbl_unique_item_requests AS vs
|
||||
ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date)
|
||||
SELECT 'OpenAIRE' as source, coalesce (ds.repository_id, vs.repository_id) as repository_id,
|
||||
coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date,
|
||||
coalesce(ds.unique_item_investigations, 0) as unique_item_investigations,
|
||||
coalesce(ds.total_item_investigations, 0) as total_item_investigations,
|
||||
coalesce(ds.unique_item_requests, 0) as unique_item_requests,
|
||||
coalesce(vs.total_item_requests, 0) as total_item_requests
|
||||
FROM tmp2 AS ds FULL OUTER JOIN ${usageStatsDB}.tbl_total_item_requests
|
||||
AS vs ON ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date;
|
|
@ -0,0 +1,13 @@
|
|||
--Episciences log
|
||||
|
||||
DROP TABLE IF EXISTS ${usageStatsDB}.episcienceslogdistinct;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS
|
||||
${usageStatsDB}.episcienceslogdistinct(source INT, id_visit STRING,
|
||||
country STRING, action STRING, url STRING,
|
||||
entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING)
|
||||
clustered by (source, id_visit, action, timestamp, entity_id)
|
||||
into 100 buckets stored as orc tblproperties('transactional'='true');
|
||||
|
||||
INSERT INTO ${usageStatsDB}.episcienceslogdistinct
|
||||
SELECT DISTINCT * FROM ${usageStatsDB}.episcienceslog WHERE entity_id is not null;
|
|
@ -0,0 +1,15 @@
|
|||
-- PeDOCS legacy usage data.
-- Loads pre-migration PeDOCS view/download counters, mapping legacy
-- identifiers to OpenAIRE result ids via ${statsDB}.result_oids.

DROP TABLE IF EXISTS ${usageStatsDB}.pedocs_views_stats_tmp;

DROP TABLE IF EXISTS ${usageStatsDB}.pedocs_downloads_stats_tmp;

CREATE TABLE IF NOT EXISTS ${usageStatsDB}.pedocs_views_stats_tmp AS
SELECT
    'OpenAIRE' AS source,
    -- hard-coded PeDOCS repository id (opendoar)
    'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' AS repository_id,
    r.id AS result_id,
    date,
    counter_abstract AS count,
    0 AS openaire
FROM ${usageRawDataDB}.pedocsoldviews p
JOIN ${statsDB}.result_oids r ON r.oid = p.identifier;

CREATE TABLE IF NOT EXISTS ${usageStatsDB}.pedocs_downloads_stats_tmp AS
SELECT
    'OpenAIRE' AS source,
    'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' AS repository_id,
    r.id AS result_id,
    date,
    counter AS count,
    0 AS openaire
FROM ${usageRawDataDB}.pedocsolddownloads p
JOIN ${statsDB}.result_oids r ON r.oid = p.identifier;
|
|
@ -0,0 +1,43 @@
|
|||
-- TU Delft usage stats (Matomo site id 252).
-- Builds monthly per-result view and download counts from the distinct
-- piwik log; legacy PIDs are matched to OpenAIRE results via
-- concat('tud:', <decoded id>) = result_oids.oid.

DROP VIEW IF EXISTS ${usageStatsDB}.tudelft_result_views_monthly_tmp;

DROP VIEW IF EXISTS ${usageStatsDB}.tudelft_result_downloads_monthly_tmp;

DROP TABLE IF EXISTS ${usageStatsDB}.tudelft_views_stats_tmp;

DROP TABLE IF EXISTS ${usageStatsDB}.tudelft_downloads_stats_tmp;

-- Monthly page-view counts per entity (action='action').
CREATE OR REPLACE VIEW ${usageStatsDB}.tudelft_result_views_monthly_tmp AS
SELECT
    entity_id,
    reflect('java.net.URLDecoder', 'decode', entity_id) AS id,  -- URL-decode the logged PID
    COUNT(entity_id) AS views,
    SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
    CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month,
    source
FROM ${usageStatsDB}.piwiklogdistinct
WHERE action = 'action'
  AND (source_item_type = 'oaItem' OR source_item_type = 'repItem')
  AND source = 252
GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;

CREATE TABLE IF NOT EXISTS ${usageStatsDB}.tudelft_views_stats_tmp AS
SELECT
    'OpenAIRE' AS source,
    d.id AS repository_id,
    ro.id AS result_id,
    month AS date,
    max(views) AS count,
    max(openaire_referrer) AS openaire
FROM ${usageStatsDB}.tudelft_result_views_monthly_tmp p,
     ${statsDB}.datasource d,
     ${statsDB}.result_oids ro
WHERE concat('tud:', p.id) = ro.oid
  AND d.id = 'opendoar____::c9892a989183de32e976c6f04e700201'  -- TU Delft opendoar id
GROUP BY d.id, ro.id, month;

-- Monthly download counts per entity (action='download').
CREATE OR REPLACE VIEW ${usageStatsDB}.tudelft_result_downloads_monthly_tmp AS
SELECT
    entity_id,
    reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
    COUNT(entity_id) AS views,
    SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
    CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month,
    source
FROM ${usageStatsDB}.piwiklogdistinct
WHERE action = 'download'
  AND (source_item_type = 'oaItem' OR source_item_type = 'repItem')
  AND source = 252
GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;

CREATE TABLE IF NOT EXISTS ${usageStatsDB}.tudelft_downloads_stats_tmp AS
SELECT
    'OpenAIRE' AS source,
    d.id AS repository_id,
    ro.id AS result_id,
    month AS date,
    max(views) AS count,
    max(openaire_referrer) AS openaire
FROM ${usageStatsDB}.tudelft_result_downloads_monthly_tmp p,
     ${statsDB}.datasource d,
     ${statsDB}.result_oids ro
WHERE concat('tud:', p.id) = ro.oid
  AND d.id = 'opendoar____::c9892a989183de32e976c6f04e700201'
GROUP BY d.id, ro.id, month;

-- Clean up the helper views.
DROP VIEW IF EXISTS ${usageStatsDB}.tudelft_result_views_monthly_tmp;

DROP VIEW IF EXISTS ${usageStatsDB}.tudelft_result_downloads_monthly_tmp;
|
|
@ -0,0 +1,43 @@
|
|||
-- B2SHARE usage stats (Matomo site id 412).
-- Builds monthly per-result view and download counts from the distinct
-- piwik log; decoded PIDs are matched directly against result_oids.oid.
-- NOTE(review): source is labelled 'B2SHARE' here while the analogous
-- TU Delft script uses 'OpenAIRE' — confirm the label is intentional.

DROP VIEW IF EXISTS ${usageStatsDB}.b2share_result_views_monthly_tmp;

-- Fixed: removed stray trailing '"' (leftover from Java string
-- concatenation) that made this statement fail to parse.
DROP VIEW IF EXISTS ${usageStatsDB}.b2share_result_downloads_monthly_tmp;

DROP TABLE IF EXISTS ${usageStatsDB}.b2share_views_stats_tmp;

DROP TABLE IF EXISTS ${usageStatsDB}.b2share_downloads_stats_tmp;

-- Monthly page-view counts per entity (action='action').
CREATE OR REPLACE VIEW ${usageStatsDB}.b2share_result_views_monthly_tmp AS
SELECT
    entity_id,
    reflect('java.net.URLDecoder', 'decode', entity_id) AS id,  -- URL-decode the logged PID
    COUNT(entity_id) AS views,
    SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
    CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month,
    source
FROM ${usageStatsDB}.piwiklogdistinct
WHERE action = 'action'
  AND (source_item_type = 'oaItem' OR source_item_type = 'repItem')
  AND source = 412
GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;

CREATE TABLE IF NOT EXISTS ${usageStatsDB}.b2share_views_stats_tmp AS
SELECT
    'B2SHARE' AS source,
    d.id AS repository_id,
    ro.id AS result_id,
    month AS date,
    max(views) AS count,
    max(openaire_referrer) AS openaire
FROM ${usageStatsDB}.b2share_result_views_monthly_tmp p,
     ${statsDB}.datasource d,
     ${statsDB}.result_oids ro
WHERE p.id = ro.oid
  AND d.id = 're3data_____::ad3609c351bd520edf6f10f5e0d9b877'  -- B2SHARE re3data id
GROUP BY d.id, ro.id, month;

-- Monthly download counts per entity (action='download').
CREATE OR REPLACE VIEW ${usageStatsDB}.b2share_result_downloads_monthly_tmp AS
SELECT
    entity_id,
    reflect('java.net.URLDecoder', 'decode', entity_id) AS id,
    COUNT(entity_id) AS views,
    SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer,
    CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month,
    source
FROM ${usageStatsDB}.piwiklogdistinct
WHERE action = 'download'
  AND (source_item_type = 'oaItem' OR source_item_type = 'repItem')
  AND source = 412
GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source;

CREATE TABLE IF NOT EXISTS ${usageStatsDB}.b2share_downloads_stats_tmp AS
SELECT
    'B2SHARE' AS source,
    d.id AS repository_id,
    ro.id AS result_id,
    month AS date,
    max(views) AS count,
    max(openaire_referrer) AS openaire
FROM ${usageStatsDB}.b2share_result_downloads_monthly_tmp p,
     ${statsDB}.datasource d,
     ${statsDB}.result_oids ro
WHERE p.id = ro.oid
  AND d.id = 're3data_____::ad3609c351bd520edf6f10f5e0d9b877'
GROUP BY d.id, ro.id, month;

-- Clean up the helper views.
DROP VIEW IF EXISTS ${usageStatsDB}.b2share_result_views_monthly_tmp;

-- Fixed: removed stray trailing '"' here as well.
DROP VIEW IF EXISTS ${usageStatsDB}.b2share_result_downloads_monthly_tmp;
|
|
@ -0,0 +1,307 @@
|
|||
<!-- Oozie workflow: builds the OpenAIRE usage-stats DB by running the
     Step*.sql Hive scripts in sequence, then invalidates Impala metadata. -->
<workflow-app name="Usage Stats Update" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>hiveMetastoreUris</name>
            <description>Hive server metastore URIs</description>
        </property>
        <property>
            <name>hiveJdbcUrl</name>
            <description>Hive server jdbc url</description>
        </property>
        <property>
            <name>impalaJdbcUrl</name>
            <description>Impala server jdbc url</description>
        </property>
        <property>
            <name>portalMatomoID</name>
            <description>Matomo ID for OpenAIRE Explore</description>
        </property>
        <property>
            <name>usageRawDataDB</name>
            <description>Raw Usage Data DB</description>
        </property>
        <property>
            <name>usageStatsDB</name>
            <description>Usage Stats DB</description>
        </property>
        <property>
            <name>statsDB</name>
            <description>Stats DB</description>
        </property>
        <property>
            <name>usagestatsPermanentDB</name>
            <description>Shadow Usage Stats DB</description>
        </property>
    </parameters>

    <!-- Example production values:
         portalMatomoID=109
         usageRawDataDBSchema=openaire_prod_usage_raw
         usageStatsDBSchema=openaire_prod_usage_stats_20230109
         statsDBSchema=openaire_prod_stats
         usagestatsPermanentDBSchema=openaire_prod_usage_stats_shadow
         recreateDbAndTables=true
         processPiwikLogs=true
         processLaReferenciaLogs=true
         irusProcessStats=true
         sarcProcessStats=true
         finalizeStats=true
         finalTablesVisibleToImpala=true
         numberOfDownloadThreads=1 -->

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>hive.metastore.uris</name>
                <value>${hiveMetastoreUris}</value>
            </property>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>spark.executor.memory</name>
                <value>19166291558</value>
            </property>
            <property>
                <name>spark.yarn.executor.memoryOverhead</name>
                <value>3225</value>
            </property>
            <property>
                <name>spark.driver.memory</name>
                <value>11596411699</value>
            </property>
            <property>
                <name>spark.yarn.driver.memoryOverhead</name>
                <value>1228</value>
            </property>
        </configuration>
    </global>

    <start to="Step1"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <!-- Recreate/prepare the usage-stats DB. -->
    <action name="Step1">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step1.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
        </hive2>
        <ok to="Step2"/>
        <error to="Kill"/>
    </action>

    <action name="Step2">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step2.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>usageRawDataDB=${usageRawDataDB}</param>
        </hive2>
        <ok to="Step3"/>
        <error to="Kill"/>
    </action>

    <action name="Step3">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step3.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
            <param>portalMatomoID=${portalMatomoID}</param>
        </hive2>
        <ok to="Step4"/>
        <error to="Kill"/>
    </action>

    <action name="Step4">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step4.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
        </hive2>
        <ok to="Step5"/>
        <error to="Kill"/>
    </action>

    <action name="Step5">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step5.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
        </hive2>
        <ok to="Step6"/>
        <error to="Kill"/>
    </action>

    <action name="Step6">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step6.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
        </hive2>
        <ok to="Step7"/>
        <error to="Kill"/>
    </action>

    <!-- NOTE(review): no transition targets this action (Step6 goes straight
         to Step7), so it is unreachable as wired. It also references
         ${usageRawDataDBSchema}, ${usageStatsDBSchema} and ${statsDBSchema},
         which are not declared in <parameters> above, and the main-class
         package ("usagestatsupdate") differs from the
         "usagestatsbuild" package seen elsewhere in this changeset —
         confirm the intended wiring, parameter names and class name. -->
    <action name="Step-EpisciencesViewsDownloads">
        <java>
            <main-class>eu.dnetlib.oa.graph.usagestatsupdate.export.ExecuteWorkflow</main-class>
            <arg>--dbHiveUrl</arg><arg>${hiveJdbcUrl}</arg>
            <arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
            <arg>--usageRawDataDBSchema</arg><arg>${usageRawDataDBSchema}</arg>
            <arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
            <arg>--usagestatsPermanentDB</arg><arg>${usagestatsPermanentDB}</arg>
            <arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
            <capture-output/>
        </java>
        <ok to="End" />
        <error to="Kill" />
    </action>

    <action name="Step7">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step7.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>usageRawDataDB=${usageRawDataDB}</param>
        </hive2>
        <ok to="Step8"/>
        <error to="Kill"/>
    </action>

    <action name="Step8">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step8.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
            <param>usageRawDataDB=${usageRawDataDB}</param>
        </hive2>
        <ok to="Step9"/>
        <error to="Kill"/>
    </action>

    <action name="Step9">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step9.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
        </hive2>
        <ok to="Step10"/>
        <error to="Kill"/>
    </action>

    <action name="Step10">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step10.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>usageRawDataDB=${usageRawDataDB}</param>
        </hive2>
        <ok to="Step11"/>
        <error to="Kill"/>
    </action>

    <action name="Step11">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step11.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
        </hive2>
        <ok to="Step12"/>
        <error to="Kill"/>
    </action>

    <action name="Step12">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step12.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
        </hive2>
        <ok to="Step13"/>
        <error to="Kill"/>
    </action>

    <action name="Step13">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step13.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
        </hive2>
        <ok to="Step14"/>
        <error to="Kill"/>
    </action>

    <action name="Step14">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step14.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
            <param>usageRawDataDB=${usageRawDataDB}</param>
        </hive2>
        <ok to="Step15"/>
        <error to="Kill"/>
    </action>

    <action name="Step15">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step15.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>statsDB=${statsDB}</param>
            <param>usageRawDataDB=${usageRawDataDB}</param>
        </hive2>
        <ok to="Step16"/>
        <error to="Kill"/>
    </action>

    <action name="Step16">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step16.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
        </hive2>
        <ok to="Step17"/>
        <error to="Kill"/>
    </action>

    <!-- NOTE(review): this action runs Step18.sql and no Step17.sql is
         referenced anywhere in this workflow — confirm the numbering gap
         is intentional. -->
    <action name="Step17">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hiveJdbcUrl}</jdbc-url>
            <script>scripts/Step18.sql</script>
            <param>usageStatsDB=${usageStatsDB}</param>
            <param>usagestatsPermanentDB=${usagestatsPermanentDB}</param>
        </hive2>
        <ok to="Step18"/>
        <error to="Kill"/>
    </action>

    <!-- Refresh Impala's view of the permanent DB once Hive is done. -->
    <action name="Step18">
        <shell xmlns="uri:oozie:shell-action:0.1">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <exec>invalidate_metadata.sh</exec>
            <argument>${usagestatsPermanentDB}</argument>
            <file>invalidate_metadata.sh</file>
        </shell>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
|
Loading…
Reference in New Issue