From 533bde32283945d74b41a5f53c0ac9a77c5668b9 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 4 Jun 2021 15:47:32 +0300 Subject: [PATCH] Commit datasets changes --- dhp-workflows/dhp-indicators/pom.xml | 107 ------------- dhp-workflows/dhp-indicators/runworkflow.sh | 1 - .../indicators/export/ExecuteWorkflow.java | 35 ----- .../indicators/oozie_app/config-default.xml | 38 ----- .../indicators/oozie_app/python/testpython.py | 5 - .../indicators/oozie_app/python/testscript.sh | 2 - .../graph/indicators/oozie_app/workflow.xml | 58 ------- .../dhp-usage-datasets-stats-update/pom.xml | 4 +- .../datasetsusagestats/export/ConnectDB.java | 33 +++- .../export/DatasetsStatsDB.java | 46 +++++- .../export/ExecuteWorkflow.java | 12 +- .../export/ReadReportsListFromDatacite.java | 85 +++++++++-- .../export/UsageStatsExporter.java | 46 ++++++ .../datasets_usagestats_parameters.json | 114 +++++++------- .../datasetsusagestats/oozie_app/workflow.xml | 4 +- .../usagerawdata/export/ExecuteWorkflow.java | 4 + .../export/PiwikDownloadLogs.java | 2 +- .../usagerawdata/export/PiwikStatsDB.java | 43 ++++++ .../export/UsageStatsExporter.java | 15 +- .../export/usagerawdata_parameters.json | 6 + .../graph/usagerawdata/oozie_app/workflow.xml | 3 +- .../usagestatsbuild/export/ConnectDB.java | 2 +- .../usagestatsbuild/export/PiwikStatsDB.java | 143 ++++++++++++++++-- .../export/UsageStatsExporter.java | 3 + .../usagestatsbuild/oozie_app/workflow.xml | 2 +- 25 files changed, 463 insertions(+), 350 deletions(-) delete mode 100644 dhp-workflows/dhp-indicators/pom.xml delete mode 100755 dhp-workflows/dhp-indicators/runworkflow.sh delete mode 100644 dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml mode change 100644 => 100755 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java diff --git a/dhp-workflows/dhp-indicators/pom.xml b/dhp-workflows/dhp-indicators/pom.xml deleted file mode 100644 index 937795791..000000000 --- a/dhp-workflows/dhp-indicators/pom.xml +++ /dev/null @@ -1,107 +0,0 @@ - - - - - - - - - dhp-workflows - eu.dnetlib.dhp - 1.1.7-SNAPSHOT - - 4.0.0 - dhp-indicators - - - - pl.project13.maven - git-commit-id-plugin - 2.1.15 - - - - revision - - - - - ${project.basedir}/../.git - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.6.1 - - 1.8 - 1.8 - - - - - - UTF-8 - UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 - - - - - org.apache.spark - spark-core_2.11 - 2.2.0 - - - org.apache.spark - spark-sql_2.11 - 2.4.5 - - - com.googlecode.json-simple - json-simple - 1.1.1 - - - org.json - json - 20180130 - jar - - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - c3p0 - c3p0 - 0.9.1.2 - jar - - - dhp-indicators - diff --git a/dhp-workflows/dhp-indicators/runworkflow.sh b/dhp-workflows/dhp-indicators/runworkflow.sh deleted file mode 100755 
index 0cad5792d..000000000 --- a/dhp-workflows/dhp-indicators/runworkflow.sh +++ /dev/null @@ -1 +0,0 @@ -mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java deleted file mode 100644 index 61e6ef72c..000000000 --- a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.indicators.export; - -import java.text.SimpleDateFormat; -import java.util.Calendar; -import java.util.Date; - -import org.apache.commons.io.IOUtils; -import org.apache.log4j.BasicConfigurator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -/** - * @author D. Pierrakos - */ -public class ExecuteWorkflow { - - private static final Logger logger = LoggerFactory.getLogger(ExecuteWorkflow.class); - - public static void main(String args[]) throws Exception { - - // Sending the logs to the console - BasicConfigurator.configure(); - - logger.info("Workflow Executed"); - } - -} diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml deleted file mode 100644 index b5c807378..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - jobTracker - ${jobTracker} - - - nameNode - ${nameNode} - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 - - - impalaJdbcUrl - jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; - - - oozie.wf.workflow.notification.url - {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status - - - oozie.use.system.libpath - true - - diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py deleted file mode 100644 index e913df6ae..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py +++ /dev/null @@ -1,5 +0,0 @@ -#! 
/usr/bin/env python -import sys - -print "this is a Python script" -print "Python Interpreter Version: " + sys.version \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh deleted file mode 100644 index 78938c85a..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -echo "`date` hi" \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml deleted file mode 100644 index 2b8ed7d99..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml +++ /dev/null @@ -1,58 +0,0 @@ - - - - hiveMetastoreUris - Hive server metastore URIs - - - hiveJdbcUrl - Hive server jdbc url - - - impalaJdbcUrl - Impala server jdbc url - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hiveMetastoreUris} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - ${jobTracker} - ${nameNode} - - - mapred.job.queue.name - ${queueName} - - - testpython.py - python/testpython.py - - - - - - - Python action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml index b39c3ff9b..c623a12f0 100755 --- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml @@ -19,7 +19,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.7-SNAPSHOT + 1.2.4-SNAPSHOT ../ 4.0.0 @@ -96,7 +96,7 @@ eu.dnetlib.dhp dhp-common - 1.1.7-SNAPSHOT + 1.2.4-SNAPSHOT jar diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java index cab0bc83f..de9e44fbf 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java @@ -9,6 +9,10 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; import org.apache.log4j.Logger; @@ -28,6 +32,7 @@ public abstract class ConnectDB { private static String dbHiveUrl; private static String dbImpalaUrl; private static String datasetUsageStatsDBSchema; + private static String datasetsUsageStatsPermanentDBSchema; private static String statsDBSchema; private final static Logger logger = Logger.getLogger(ConnectDB.class); private Statement stmt = null; @@ -37,6 +42,7 @@ public abstract class ConnectDB { dbHiveUrl = ExecuteWorkflow.dbHiveUrl; dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema; + 
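/*
 * A minimal side sketch (illustrative, not part of the commit; assumes nothing
 * beyond the JDK): the getter added below derives a per-run schema name by
 * appending today's date to the configured base schema. Note that
 * SimpleDateFormat's "YYYY" means the week-based year, so "yyyyMMdd" is the
 * safe pattern around New Year.
 */
import java.text.SimpleDateFormat;
import java.util.Date;

class SchemaNameSketch {
    static String dateSuffixedSchema(String baseSchema) {
        // e.g. "datasetusagestats" + "_" + "20210604" -> "datasetusagestats_20210604"
        String today = new SimpleDateFormat("yyyyMMdd").format(new Date());
        return baseSchema + "_" + today;
    }
}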
datasetsUsageStatsPermanentDBSchema = ExecuteWorkflow.datasetsUsageStatsPermanentDBSchema; statsDBSchema = ExecuteWorkflow.statsDBSchema; Class.forName("org.apache.hive.jdbc.HiveDriver"); @@ -63,14 +69,25 @@ } public static String getDataSetUsageStatsDBSchema() { - return ConnectDB.datasetUsageStatsDBSchema; + String datePattern = "yyyyMMdd"; + DateFormat df = new SimpleDateFormat(datePattern); +// Get today's date using a Calendar object. + Date today = Calendar.getInstance().getTime(); + String todayAsString = df.format(today); + + return ConnectDB.datasetUsageStatsDBSchema + "_" + todayAsString; } public static String getStatsDBSchema() { return ConnectDB.statsDBSchema; } + public static String getDatasetsUsagestatsPermanentDBSchema() { + return ConnectDB.datasetsUsageStatsPermanentDBSchema; + } + private static Connection connectHive() throws SQLException { + logger.info("Trying to open Hive connection..."); ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbHiveUrl); @@ -90,14 +107,18 @@ cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened database successfully"); + logger.info("Opened Hive successfully"); return cpds.getConnection(); +// Connection connection = DriverManager.getConnection(dbHiveUrl); +// logger.debug("Opened Hive successfully"); +// +// return connection; } private static Connection connectImpala() throws SQLException { - + logger.info("Trying to open Impala connection..."); ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbImpalaUrl); cpds.setUser("dimitris.pierrakos"); @@ -116,8 +137,12 @@ cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened database successfully"); + logger.info("Opened Impala successfully"); return cpds.getConnection(); +// Connection connection = DriverManager.getConnection(dbHiveUrl); +// logger.debug("Opened Impala successfully"); +// +// return connection; } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java index 17661b99e..baffa39e0 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java @@ -1,8 +1,6 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; -import java.sql.Connection; -import java.sql.SQLException; import java.sql.Statement; import org.slf4j.Logger; @@ -47,7 +45,7 @@ public class DatasetsStatsDB { try { stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + logger.info("Creating datacite usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); stmt.executeUpdate(createDatabase); @@ -55,6 +53,23 @@ logger.error("Failed to create database: " + e); throw new Exception("Failed to create database: " + e.toString(), e); } + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger + .info( + "Creating permanent datasets
usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema()); + String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS " + + ConnectDB.getDatasetsUsagestatsPermanentDBSchema(); + stmt.executeUpdate(createPermanentDatabase); + logger + .info( + "Created permanent datasets usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema()); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } } private void createTables() throws Exception { @@ -62,10 +77,10 @@ public class DatasetsStatsDB { stmt = ConnectDB.getHiveConnection().createStatement(); // Create Reports table - This table should exist - logger.info("Creating Reports Table"); + logger.info("Creating Reports Tmp Table"); String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports(reportid STRING, \n" + + ".datacitereports_tmp(reportid STRING, \n" + " name STRING, \n" + " source STRING,\n" + " release STRING,\n" @@ -79,10 +94,10 @@ public class DatasetsStatsDB { logger.info("Reports Table Created"); // Create Datasets Performance Table - logger.info("Creating DataSetsPerformance Table"); + logger.info("Creating DataSetsPerformance Tmp Table"); String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance(ds_type STRING,\n" + + ".datasetsperformance_tmp(ds_type STRING,\n" + " ds_title STRING,\n" + " yop STRING,\n" + " dataset_type STRING, \n" @@ -100,7 +115,22 @@ public class DatasetsStatsDB { + " CLUSTERED BY (ds_type)\n" + " into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTableDataSetsPerformance); - logger.info("DataSetsPerformance Table Created"); + logger.info("DataSetsPerformance Tmp Table Created"); + + logger.info("Creating Datacite Reports table"); + String createDataciteReportsTable = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports LIKE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports_tmp STORED AS PARQUET"; + stmt.executeUpdate(createDataciteReportsTable); + logger.info("Datacite Reports Table created"); + + logger.info("Creating Datasets Performance table"); + String createDatasetPerformanceTable = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance LIKE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance_tmp STORED AS PARQUET"; + stmt.executeUpdate(createDatasetPerformanceTable); + logger.info("DatasetsPerformance Table created"); stmt.close(); ConnectDB.getHiveConnection().close(); diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java index b28578e4b..ffa8b8199 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java @@ -21,6 +21,7 @@ public class ExecuteWorkflow { static String dbHiveUrl; static String dbImpalaUrl; static String datasetUsageStatsDBSchema; + static String datasetsUsageStatsPermanentDBSchema; static String statsDBSchema; static 
boolean recreateDbAndTables; static boolean datasetsEmptyDirs; @@ -45,6 +46,7 @@ public class ExecuteWorkflow { dbHiveUrl = parser.get("dbHiveUrl"); dbImpalaUrl = parser.get("dbImpalaUrl"); datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema"); + datasetsUsageStatsPermanentDBSchema = parser.get("datasetsUsageStatsPermanentDBSchema"); statsDBSchema = parser.get("statsDBSchema"); if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) @@ -57,11 +59,11 @@ public class ExecuteWorkflow { else datasetsEmptyDirs = false; -// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) -// finalTablesVisibleToImpala = true; -// else -// finalTablesVisibleToImpala = false; -// + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) + finalTablesVisibleToImpala = true; + else + finalTablesVisibleToImpala = false; + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java index 1b769bf53..e89e2e5a4 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -65,7 +65,7 @@ public class ReadReportsListFromDatacite { logger.info("Checking report with id " + reportID); String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports where reportid=?"; + + ".datacitereports_tmp where reportid=?"; PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); stGetReportID.setString(1, reportID); @@ -76,7 +76,7 @@ public class ReadReportsListFromDatacite { dropTmpReportsTable(); } else { String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datacitereports " + + " .datacitereports_tmp " + "SELECT\n" + " get_json_object(json, '$.report.id') AS reportid,\n" + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" @@ -127,7 +127,7 @@ public class ReadReportsListFromDatacite { public void readDatasetsReport(String prettyDatasetsReports, String reportId) throws Exception { logger.info("Reading Datasets performance for report " + reportId); logger.info("Write Performance Report To File"); - + ConnectDB.getHiveConnection().setAutoCommit(false); ObjectMapper objectMapper = new ObjectMapper(); JsonNode jsonNode = objectMapper.readValue(prettyDatasetsReports, JsonNode.class); String datasetsReports = jsonNode.toString(); @@ -177,7 +177,7 @@ public class ReadReportsListFromDatacite { stmt.execute(sqlCreateTempTableForDatasets); String sqlInsertToDatasetsPerformance = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance SELECT dataset.dataset_id[0].value ds_type, " + + ".datasetsperformance_tmp SELECT dataset.dataset_id[0].value ds_type, " + " dataset.dataset_title ds_title, " + " dataset.yop yop, " + " dataset.data_type dataset_type, " @@ -296,32 +296,93 @@ public class ReadReportsListFromDatacite { } public void createUsageStatisticsTable() throws SQLException { - logger.info("Dropping Downloads Stats table"); 
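/*
 * A condensed, illustrative restatement of the report-existence check above,
 * assuming a plain JDBC Connection; only the schema/table/column names come
 * from the patch, the helper itself is hypothetical.
 */
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

class ReportCheckSketch {
    static boolean reportAlreadyStored(Connection conn, String schema, String reportId)
            throws SQLException {
        String sql = "SELECT source FROM " + schema + ".datacitereports_tmp where reportid=?";
        try (PreparedStatement st = conn.prepareStatement(sql)) {
            st.setString(1, reportId);
            try (ResultSet rs = st.executeQuery()) {
                // a returned row means this report was already ingested in a previous run
                return rs.next();
            }
        }
    }
}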
Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String dropDownloadsTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacite_downloads"; - stmt.executeUpdate(dropDownloadsTable); + + logger.info("Updating Datacite Reports table"); + String createDataciteReportsTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports " + + "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports_tmp"; + stmt.executeUpdate(createDataciteReportsTable); + logger.info("Datacite Reports Table updated"); + + logger.info("Updating Datasets Performance table"); + String createDatasetPerformanceTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance " + + "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance_tmp"; + stmt.executeUpdate(createDatasetPerformanceTable); + logger.info("DatasetsPerformance Table updated"); logger.info("Creating Downloads Stats table"); String createDownloadsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacite_downloads as " + + ".datacite_downloads STORED AS PARQUET as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " - + "where metric_type='total-dataset-requests'"; + + "where metric_type='total-dataset-requests' "; stmt.executeUpdate(createDownloadsTable); logger.info("Downloads Stats table created"); logger.info("Creating Views Stats table"); - String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views as " + String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacite_views STORED AS PARQUET as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " - + "where metric_type='total-dataset-investigations'"; + + "where metric_type='total-dataset-investigations' "; stmt.executeUpdate(createViewsTable); logger.info("Views Stats table created"); + + logger.info("Building Permanent Datasets Usage Stats DB"); + + logger.info("Dropping view datacitereports on permanent datacite usagestats DB"); + String sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacitereports on permanent datacite usagestats DB"); + + logger.info("Create view datacitereports on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + logger.info("Created view datacitereports on permanent datasets usagestats DB"); + + logger.info("Dropping view datasetsperformance on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + 
ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + logger.info("Dropped view datasetsperformance on permanent datacite usagestats DB"); + + logger.info("Create view datasetsperformance on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + logger.info("Created view datasetsperformance on permanent datasets usagestats DB"); + + logger.info("Dropping view datacite_views on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacite_views on permanent datacite usagestats DB"); + + logger.info("Create view datacite_views on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Created view datacite_views on permanent datasets usagestats DB"); + + logger.info("Dropping view datacite_downloads on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacite_downloads on permanent datacite usagestats DB"); + + logger.info("Create view datacite_downloads on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Created view datacite_downloads on permanent datasets usagestats DB"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Completed Building Permanent Datasets Usage Stats DB"); } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java old mode 100644 new mode 100755 index d96d7e875..8d6e24333 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java @@ -2,6 +2,7 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.io.IOException; +import java.sql.SQLException; import java.sql.Statement; import org.apache.hadoop.conf.Configuration; @@ -67,5 +68,50 @@ public class UsageStatsExporter { readReportsListFromDatacite.readReports(); logger.info("Reports Stored To DB"); readReportsListFromDatacite.createUsageStatisticsTable(); + + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } + + logger.info("End"); + } + + private void invalidateMetadata() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getImpalaConnection().createStatement(); + + String sql = "INVALIDATE METADATA " + 
ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + + stmt.close(); + try { + ConnectDB.getHiveConnection().close(); + } catch (Exception e) { + logger.info("Message at the end :" + e.getMessage()); + } } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json index f8d51a882..f67651627 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json @@ -1,56 +1,62 @@ [ - { - "paramName": "dbu", - "paramLongName": "dataciteBaseURL", - "paramDescription": "URL of Datacite Reports Endpoint", - "paramRequired": true - }, - { - "paramName": "drp", - "paramLongName": "dataciteReportPath", - "paramDescription": "Path for Datacite Reports", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dusdbs", - "paramLongName": "datasetUsageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. 
Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "datasetsEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the dataset_usage_stats, visible to Impala", - "paramRequired": true - } + { + "paramName": "dbu", + "paramLongName": "dataciteBaseURL", + "paramDescription": "URL of Datacite Reports Endpoint", + "paramRequired": true + }, + { + "paramName": "drp", + "paramLongName": "dataciteReportPath", + "paramDescription": "Path for Datacite Reports", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dusdbs", + "paramLongName": "datasetUsageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "uspdbs", + "paramLongName": "datasetsUsageStatsPermanentDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "datasetsEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the dataset_usage_stats, visible to Impala", + "paramRequired": true + } ] diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml index 36c1ccea5..22bf22c01 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + hiveMetastoreUris @@ -52,6 +52,8 @@ ${impalaJdbcUrl} --datasetUsageStatsDBSchema ${datasetUsageStatsDBSchema} + --datasetsUsageStatsPermanentDBSchema + ${datasetsUsageStatsPermanentDBSchema} --statsDBSchema ${statsDBSchema} --recreateDbAndTables diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java index e0e0d3687..d2884a4bb 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java +++ 
b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java @@ -65,6 +65,8 @@ public class ExecuteWorkflow { static int numberOfDownloadThreads; + static int b2SSHAREID; + public static void main(String args[]) throws Exception { // Sending the logs to the console @@ -196,6 +198,8 @@ public class ExecuteWorkflow { numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); + b2SSHAREID = Integer.parseInt(parser.get("b2shareID")); + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); // usagestatsExport.createdDBWithTablesOnly(); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java index a84d6743f..76412cd54 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java @@ -191,7 +191,7 @@ public class PiwikDownloadLogs { ResultSet rs = statement .executeQuery( "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() - + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + + ".datasource where piwik_id is not null and piwik_id <> 0 and piwik_id <> 196 order by piwik_id"); // Getting all the piwikids in a list for logging reasons & limitting the list // to the max number of piwikids diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java index 9144620b7..00378ca1f 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java @@ -179,6 +179,10 @@ public class PiwikStatsDB { createPedocsOldUsageData(); logger.info("Pedocs Tables Created"); + logger.info("Create Datacite Tables"); + createDatasetsUsageData(); + logger.info("Datacite Tables Created"); + } catch (Exception e) { logger.error("Failed to process logs: " + e); throw new Exception("Failed to process logs: " + e.toString(), e); @@ -281,6 +285,7 @@ public class PiwikStatsDB { // clean view double clicks logger.info("Cleaning action double clicks"); + ConnectDB.getHiveConnection().setAutoCommit(false); sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "WHERE EXISTS (\n" + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" @@ -750,6 +755,16 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("Dropped sarc_sushilogtmp_json_non_array"); + logger.info("Dropping piwiklogb2sharetmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogb2sharetmp"); + + logger.info("Dropping piwiklog_b2share_tmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog_b2share_tmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklog_b2share_tmp_json"); + stmt.close(); ConnectDB.getHiveConnection().close(); @@ -832,4 +847,32 @@ public class PiwikStatsDB { logger.info("PeDocs Old 
Downloads Table created"); } + + public void createDatasetsUsageData() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping datacite_views"); + String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Dropped datacite_views"); + + logger.info("Dropping datacite_downloads"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Dropped datacite_downloads"); + + logger.info("Creating Datasets Views Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".datacite_views as select * from openaire_prod_datacite_usage_stats.datacite_views"; + stmt.executeUpdate(sql); + logger.info("Datasets Views Table created"); + + logger.info("Creating Datasets Downloads Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".datacite_downloads as select * from openaire_prod_datacite_usage_stats.datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Datasets Downloads Table created"); + + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java index 07e15605f..2f10e4d2b 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java @@ -142,8 +142,20 @@ public class UsageStatsExporter { sarcStats.updateSarcLogs(); } logger.info("Sarc done"); - // finalize usagestats + PiwikDownloadLogs_B2SHARE b2sharePiwikID = new PiwikDownloadLogs_B2SHARE(ExecuteWorkflow.matomoBaseURL, + ExecuteWorkflow.matomoAuthToken); + b2sharePiwikID.GetOpenAIREB2SHARELogs(ExecuteWorkflow.repoLogPath); + logger.info("B2SHARE done"); + + PiwikStatsDB_B2SHARE piwikstatsB2SHAREdb = new PiwikStatsDB_B2SHARE(ExecuteWorkflow.repoLogPath, + ExecuteWorkflow.portalLogPath); + piwikstatsB2SHAREdb.setCounterRobotsURL(cRobotsUrl); + + logger.info("Processing B2SHARE logs"); + piwikstatsB2SHAREdb.processB2SHARELogs(); + + // finalize usagestats logger.info("Dropping tmp tables"); if (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); @@ -161,6 +173,7 @@ public class UsageStatsExporter { piwikstatsdb.recreateDBAndTables(); piwikstatsdb.createPedocsOldUsageData(); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); logger.info("Creating LaReferencia tables"); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json index 1aa5ad6f8..8c733c55b 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json @@ -215,5 +215,11 @@ "paramLongName": "numberOfDownloadThreads", "paramDescription": "Number of download threads", "paramRequired": true + }, + { + "paramName": "b2shareID", + 
"paramLongName": "b2shareID", + "paramDescription": "B2SHARE Matomo ID", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml index 022a107ab..80e1da478 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + hiveMetastoreUris @@ -78,6 +78,7 @@ --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} --finalizeStats${finalizeStats} --numberOfDownloadThreads${numberOfDownloadThreads} + --b2shareID${b2shareID} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java index e53709f1a..ea07ed732 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java @@ -82,7 +82,7 @@ public abstract class ConnectDB { Date today = Calendar.getInstance().getTime(); String todayAsString = df.format(today); - return ConnectDB.usageStatsDBSchema + "_" + todayAsString; + return ConnectDB.usageStatsDBSchema + todayAsString; } public static String getStatsDBSchema() { diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java index 5a6953f4c..7c6f28023 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java @@ -35,20 +35,20 @@ public class PiwikStatsDB { private void createDatabase() throws Exception { -// try { -// -// stmt = ConnectDB.getHiveConnection().createStatement(); -// -// logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); -// String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; -// stmt.executeUpdate(dropDatabase); -// } catch (Exception e) { -// logger.error("Failed to drop database: " + e); -// throw new Exception("Failed to drop database: " + e.toString(), e); -// } -// try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); stmt.executeUpdate(createDatabase); @@ -337,6 +337,96 @@ public class PiwikStatsDB { } + public void uploadB2SHAREStats() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + 
ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping B2SHARE b2share_result_views_monthly_tmp view + logger.info("Dropping B2SHARE b2share_result_views_monthly_tmp view"); + String sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_views_monthly_tmp"; + logger.info("Dropped b2share_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping B2SHARE b2share_result_views_monthly_tmp view + logger.info("Dropping b2SHARE b2share_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_downloads_monthly_tmp"; + logger.info("Dropped b2share_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping B2SHARE b2share_views_stats_tmp table + logger.info("Dropping B2SHARE b2share_views_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_views_stats_tmp"; + logger.info("Dropped b2share_views_stats_tmp table "); + stmt.executeUpdate(sql); + + // Dropping B2SHARE b2share_downloads_stats_tmp table + logger.info("Dropping B2SHARE b2share_downloads_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_downloads_stats_tmp"; + logger.info("Dropped b2share_downloads_stats_tmp table "); + stmt.executeUpdate(sql); + + // Creating B2SHARE b2share_result_views_monthly_tmp view + logger.info("Creating B2SHARE b2share_result_views_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_views_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='action' and (source_item_type='oaItem' or source_item_type='repItem') and source=412 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created b2share_result_views_monthly_tmp view "); + + // Creating B2SHARE b2share_views_stats_tmp table + logger.info("Creating B2SHARE b2share_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_views_stats_tmp AS " + + "SELECT 'B2SHARE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".b2share_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.id=ro.oid and d.id='re3data_____::ad3609c351bd520edf6f10f5e0d9b877' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created B2SHARE b2share_views_stats_tmp table"); + + // Creating B2SHARE b2share_result_downloads_monthly_tmp view + logger.info("Creating B2SHARE b2share_result_downloads_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_downloads_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + 
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='download' and (source_item_type='oaItem' or source_item_type='repItem') and source=412 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created b2share_result_downloads_monthly_tmp view "); + + // Creating B2SHARE b2share_downloads_stats_tmp table + logger.info("Creating B2SHARE b2share_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_downloads_stats_tmp AS " + + "SELECT 'B2SHARE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".b2share_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.id=ro.oid and d.id='re3data_____::ad3609c351bd520edf6f10f5e0d9b877' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created B2SHARE b2share_downloads_stats_tmp table"); + + // Dropping B2SHARE b2share_result_views_monthly_tmp view + logger.info("Dropping B2SHARE b2share_result_views_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_views_monthly_tmp"; + logger.info("Dropped b2share_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping B2SHARE b2share_result_views_monthly_tmp view + logger.info("Dropping B2SHARE b2share_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_downloads_monthly_tmp"; + logger.info("Dropped b2share_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + } + public void finalizeStats() throws Exception { stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); @@ -402,6 +492,13 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("LaReferencia views updated to views_stats"); + // Inserting B2SHARE views stats + logger.info("Inserting B2SHARE data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".b2share_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("B2SHARE views updated to views_stats"); + logger.info("Creating downloads_stats table"); String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() @@ -425,12 +522,18 @@ public class PiwikStatsDB { logger.info("Inserted Pedocs data to downloads_stats"); // Inserting TUDELFT downloads stats - logger.info("Inserting TUDELFT old data to downloads_stats"); + logger.info("Inserting TUDELFT data to downloads_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp"; stmt.executeUpdate(sql); logger.info("Inserted TUDELFT data to downloads_stats"); + // Inserting B2SHARE downloads stats + logger.info("Inserting B2SHARE data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".b2share_downloads_stats_tmp"; + 
stmt.executeUpdate(sql); + logger.info("Inserted B2SHARE data to downloads_stats"); // Inserting Lareferencia downloads stats logger.info("Inserting LaReferencia data to downloads_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " @@ -452,6 +555,20 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("SARC-OJS downloads updated to downloads_stats"); + // Inserting Datacite views stats + logger.info("Inserting Datacite views to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Datacite views updated to views_stats"); + + // Inserting Datacite downloads stats + logger.info("Inserting Datacite downloads to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Datacite downloads updated to downloads_stats"); + logger.info("Creating pageviews_stats table"); String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java index 47986f52a..0df6c8b2d 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java @@ -51,6 +51,9 @@ public class UsageStatsExporter { logger.info("Processing TUDELFT Stats"); piwikstatsdb.uploadTUDELFTStats(); logger.info("Processing TUDELFT Stats Done"); + logger.info("Processing B2SHARE Stats"); + piwikstatsdb.uploadB2SHAREStats(); + logger.info("Processing B2SHARE Stats Done"); } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml index 71e8a50d6..45a6abf3d 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + hiveMetastoreUris
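For reference, the new --b2shareID workflow argument wired in above is read on the Java side with Integer.parseInt(parser.get("b2shareID")). A minimal, self-contained sketch of that round trip; the tiny loop below is only a stand-in for the project's ArgumentApplicationParser, and the default 412 mirrors the Matomo site id used in the B2SHARE queries above:

import java.util.HashMap;
import java.util.Map;

class ArgSketch {
    public static void main(String[] args) {
        // e.g. args = { "--b2shareID", "412" }
        Map<String, String> parsed = new HashMap<>();
        for (int i = 0; i + 1 < args.length; i += 2) {
            parsed.put(args[i].replaceFirst("^--", ""), args[i + 1]);
        }
        int b2shareId = Integer.parseInt(parsed.getOrDefault("b2shareID", "412"));
        System.out.println("B2SHARE Matomo site id: " + b2shareId);
    }
}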