From 1e06815cdb8e3cdc091cf1da42964f2fe2e25f5d Mon Sep 17 00:00:00 2001
From: Dimitris
Date: Wed, 17 Feb 2021 09:46:38 +0200
Subject: [PATCH] Added Datasets from Datacite WF

---
 dhp-workflows/dhp-indicators/pom.xml            | 107 ++++++++++++++++
 dhp-workflows/dhp-indicators/runworkflow.sh     |   1 +
 .../indicators/export/ExecuteWorkflow.java      |  35 ++++++
 .../indicators/oozie_app/config-default.xml     |  38 ++++++
 .../indicators/oozie_app/python/testpython.py   |   5 +
 .../indicators/oozie_app/python/testscript.sh   |   2 +
 .../graph/indicators/oozie_app/workflow.xml     |  58 +++++++++
 .../dhp-usage-datasets-stats-update/pom.xml     |   4 +-
 .../datasetsusagestats/export/ConnectDB.java    |  33 +----
 .../export/DatasetsStatsDB.java                 |  46 ++-----
 .../export/ExecuteWorkflow.java                 |  12 +-
 .../export/ReadReportsListFromDatacite.java     |  93 +++-----------
 .../export/UsageStatsExporter.java              |  46 -------
 .../datasets_usagestats_parameters.json         | 114 +++++++++---------
 .../datasetsusagestats/oozie_app/workflow.xml   |   4 +-
 15 files changed, 335 insertions(+), 263 deletions(-)
 create mode 100644 dhp-workflows/dhp-indicators/pom.xml
 create mode 100755 dhp-workflows/dhp-indicators/runworkflow.sh
 create mode 100644 dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java
 create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py
 create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh
 create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml
 mode change 100755 => 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java

diff --git a/dhp-workflows/dhp-indicators/pom.xml b/dhp-workflows/dhp-indicators/pom.xml
new file mode 100644
index 000000000..937795791
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/pom.xml
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>dhp-workflows</artifactId>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <version>1.1.7-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <artifactId>dhp-indicators</artifactId>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>pl.project13.maven</groupId>
+                <artifactId>git-commit-id-plugin</artifactId>
+                <version>2.1.15</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>revision</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.6.1</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+        <cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version>
+        <cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.11</artifactId>
+            <version>2.2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+            <version>2.4.5</version>
+        </dependency>
+        <dependency>
+            <groupId>com.googlecode.json-simple</groupId>
+            <artifactId>json-simple</artifactId>
+            <version>1.1.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.json</groupId>
+            <artifactId>json</artifactId>
+            <version>20180130</version>
+            <type>jar</type>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hive</groupId>
+            <artifactId>hive-jdbc</artifactId>
+            <version>${cdh.hive.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+            <version>${cdh.hadoop.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-common</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>c3p0</groupId>
+            <artifactId>c3p0</artifactId>
+            <version>0.9.1.2</version>
+            <type>jar</type>
+        </dependency>
+    </dependencies>
+    <name>dhp-indicators</name>
+</project>
diff --git a/dhp-workflows/dhp-indicators/runworkflow.sh b/dhp-workflows/dhp-indicators/runworkflow.sh
new file mode 100755
index 000000000..0cad5792d
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/runworkflow.sh
@@ -0,0 +1 @@
+mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators
\ No newline at end of file
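Note on the new module's dependencies: the hive-jdbc + c3p0 pair is what the exporter code below builds on. A minimal sketch of how the two are typically combined (hypothetical class, not part of this patch; it mirrors the pooling setup ConnectDB uses further down, with a placeholder JDBC URL):

    import java.sql.Connection;
    import java.sql.ResultSet;

    import com.mchange.v2.c3p0.ComboPooledDataSource;

    public class HivePoolSketch {
        public static void main(String[] args) throws Exception {
            // HiveDriver comes from the hive-jdbc artifact declared in the pom
            Class.forName("org.apache.hive.jdbc.HiveDriver");

            ComboPooledDataSource cpds = new ComboPooledDataSource();
            cpds.setJdbcUrl("jdbc:hive2://example-host:10000/;UseNativeQuery=1"); // placeholder URL
            cpds.setAcquireIncrement(1);
            cpds.setMaxPoolSize(20);

            // c3p0 hands out pooled connections; "SELECT 1" verifies the server is reachable
            try (Connection con = cpds.getConnection();
                ResultSet rs = con.createStatement().executeQuery("SELECT 1")) {
                rs.next();
                System.out.println("Hive reachable: " + rs.getInt(1));
            }
        }
    }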
diff --git a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java
new file mode 100644
index 000000000..61e6ef72c
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java
@@ -0,0 +1,35 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.indicators.export;
+
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+import java.util.Date;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.BasicConfigurator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+/**
+ * @author D. Pierrakos
+ */
+public class ExecuteWorkflow {
+
+	private static final Logger logger = LoggerFactory.getLogger(ExecuteWorkflow.class);
+
+	public static void main(String args[]) throws Exception {
+
+		// Sending the logs to the console
+		BasicConfigurator.configure();
+
+		logger.info("Workflow Executed");
+	}
+
+}
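This ExecuteWorkflow imports ArgumentApplicationParser but does not use it yet. If the indicators module ends up following the sibling dhp-usage-datasets-stats-update module (see its ExecuteWorkflow later in this patch), the wiring would look roughly like the sketch below; the JSON resource path is a hypothetical placeholder:

    // Sketch only: parameter parsing as done in the datasets module's ExecuteWorkflow.
    ArgumentApplicationParser parser = new ArgumentApplicationParser(
        IOUtils
            .toString(
                ExecuteWorkflow.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/indicators/export/indicators_parameters.json"))); // hypothetical resource
    parser.parseArgument(args);
    // typed options would then be read with parser.get("optionName")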
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml
new file mode 100644
index 000000000..b5c807378
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml
@@ -0,0 +1,38 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>${jobTracker}</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>${nameNode}</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>hiveMetastoreUris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>hiveJdbcUrl</name>
+        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1</value>
+    </property>
+    <property>
+        <name>impalaJdbcUrl</name>
+        <value>jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl;</value>
+    </property>
+    <property>
+        <name>oozie.wf.workflow.notification.url</name>
+        <value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+</configuration>
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py
new file mode 100644
index 000000000..e913df6ae
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py
@@ -0,0 +1,5 @@
+#! /usr/bin/env python
+import sys
+
+print "this is a Python script"
+print "Python Interpreter Version: " + sys.version
\ No newline at end of file
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh
new file mode 100644
index 000000000..78938c85a
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+echo "`date` hi"
\ No newline at end of file
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml
new file mode 100644
index 000000000..2b8ed7d99
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml
@@ -0,0 +1,58 @@
+<workflow-app name="Indicators Graph" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>hiveMetastoreUris</name>
+            <description>Hive server metastore URIs</description>
+        </property>
+        <property>
+            <name>hiveJdbcUrl</name>
+            <description>Hive server jdbc url</description>
+        </property>
+        <property>
+            <name>impalaJdbcUrl</name>
+            <description>Impala server jdbc url</description>
+        </property>
+    </parameters>
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>hive.metastore.uris</name>
+                <value>${hiveMetastoreUris}</value>
+            </property>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+        </configuration>
+    </global>
+    <start to="PythonTest"/>
+    <action name="PythonTest">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <configuration>
+                <property>
+                    <name>mapred.job.queue.name</name>
+                    <value>${queueName}</value>
+                </property>
+            </configuration>
+            <exec>testpython.py</exec>
+            <file>python/testpython.py</file>
+        </shell>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    <kill name="Kill">
+        <message>Python action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
index c623a12f0..b39c3ff9b 100755
--- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
@@ -19,7 +19,7 @@
 	<parent>
 		<artifactId>dhp-workflows</artifactId>
 		<groupId>eu.dnetlib.dhp</groupId>
-		<version>1.2.4-SNAPSHOT</version>
+		<version>1.1.7-SNAPSHOT</version>
 		<relativePath>../</relativePath>
 	</parent>
 	<modelVersion>4.0.0</modelVersion>
@@ -96,7 +96,7 @@
 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>
 			<artifactId>dhp-common</artifactId>
-			<version>1.2.4-SNAPSHOT</version>
+			<version>1.1.7-SNAPSHOT</version>
 			<type>jar</type>
 		</dependency>
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
index de9e44fbf..cab0bc83f 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
@@ -9,10 +9,6 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export;
 
 import java.sql.Connection;
 import java.sql.SQLException;
 import java.sql.Statement;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.Calendar;
-import java.util.Date;
 
 import org.apache.log4j.Logger;
@@ -32,7 +28,6 @@ public abstract class ConnectDB {
 	private static String dbHiveUrl;
 	private static String dbImpalaUrl;
 	private static String datasetUsageStatsDBSchema;
-	private static String datasetsUsageStatsPermanentDBSchema;
 	private static String statsDBSchema;
 	private final static Logger logger = Logger.getLogger(ConnectDB.class);
 	private Statement stmt = null;
@@ -42,7 +37,6 @@ public abstract class ConnectDB {
 		dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
 		dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
 		datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema;
-		datasetsUsageStatsPermanentDBSchema = ExecuteWorkflow.datasetsUsageStatsPermanentDBSchema;
 		statsDBSchema = ExecuteWorkflow.statsDBSchema;
 
 		Class.forName("org.apache.hive.jdbc.HiveDriver");
@@ -69,25 +63,14 @@ public abstract class ConnectDB {
 	}
 
 	public static String getDataSetUsageStatsDBSchema() {
-		String datePattern = "YYYYMMdd";
-		DateFormat df = new SimpleDateFormat(datePattern);
-//		Get the today date using Calendar object.
-		Date today = Calendar.getInstance().getTime();
-		String todayAsString = df.format(today);
-
-		return ConnectDB.datasetUsageStatsDBSchema + "_" + todayAsString;
+		return ConnectDB.datasetUsageStatsDBSchema;
 	}
 
 	public static String getStatsDBSchema() {
 		return ConnectDB.statsDBSchema;
 	}
 
-	public static String getDatasetsUsagestatsPermanentDBSchema() {
-		return ConnectDB.datasetsUsageStatsPermanentDBSchema;
-	}
-
 	private static Connection connectHive() throws SQLException {
-		logger.info("trying to open Hive connection...");
 
 		ComboPooledDataSource cpds = new ComboPooledDataSource();
 		cpds.setJdbcUrl(dbHiveUrl);
@@ -107,18 +90,14 @@ public abstract class ConnectDB {
 		cpds.setPreferredTestQuery("SELECT 1");
 		cpds.setIdleConnectionTestPeriod(60);
 
-		logger.info("Opened HIVE successfully");
+		logger.info("Opened database successfully");
 
 		return cpds.getConnection();
-//		Connection connection = DriverManager.getConnection(dbHiveUrl);
-//		logger.debug("Opened Hive successfully");
-//
-//		return connection;
 
 	}
 
 	private static Connection connectImpala() throws SQLException {
-		logger.info("trying to open Impala connection...");
+
 		ComboPooledDataSource cpds = new ComboPooledDataSource();
 		cpds.setJdbcUrl(dbImpalaUrl);
 		cpds.setUser("dimitris.pierrakos");
@@ -137,12 +116,8 @@ public abstract class ConnectDB {
 		cpds.setPreferredTestQuery("SELECT 1");
 		cpds.setIdleConnectionTestPeriod(60);
 
-		logger.info("Opened Impala successfully");
+		logger.info("Opened database successfully");
 
 		return cpds.getConnection();
-//		Connection connection = DriverManager.getConnection(dbHiveUrl);
-//		logger.debug("Opened Impala successfully");
-//
-//		return connection;
 	}
 }
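A side effect of the ConnectDB change worth calling out: the usage-stats schema name is no longer date-suffixed, so every run now reads and writes the same database instead of a per-day snapshot. A small illustration (values are examples only):

    // Hypothetical illustration of schema resolution before and after this patch,
    // assuming ExecuteWorkflow.datasetUsageStatsDBSchema = "datasetusagestats"
    String table = ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports";
    // before: "datasetusagestats_20210217.datacitereports"  (dated snapshot per run)
    // after:  "datasetusagestats.datacitereports"           (one stable schema)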
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java
index baffa39e0..17661b99e 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java
@@ -1,6 +1,8 @@
 
 package eu.dnetlib.oa.graph.datasetsusagestats.export;
 
+import java.sql.Connection;
+import java.sql.SQLException;
 import java.sql.Statement;
 
 import org.slf4j.Logger;
@@ -45,7 +47,7 @@ public class DatasetsStatsDB {
 		try {
 			stmt = ConnectDB.getHiveConnection().createStatement();
 
-			logger.info("Creating datacite usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
+			logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
 			String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
 			stmt.executeUpdate(createDatabase);
 
@@ -53,23 +55,6 @@ public class DatasetsStatsDB {
 			logger.error("Failed to create database: " + e);
 			throw new Exception("Failed to create database: " + e.toString(), e);
 		}
-		try {
-			stmt = ConnectDB.getHiveConnection().createStatement();
-
-			logger
-				.info(
-					"Creating permanent datasets usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema());
-			String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS "
-				+ ConnectDB.getDatasetsUsagestatsPermanentDBSchema();
-			stmt.executeUpdate(createPermanentDatabase);
-			logger
-				.info(
-					"Created permanent datasets usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema());
-
-		} catch (Exception e) {
-			logger.error("Failed to create database: " + e);
-			throw new Exception("Failed to create database: " + e.toString(), e);
-		}
 	}
 
 	private void createTables() throws Exception {
@@ -77,10 +62,10 @@ public class DatasetsStatsDB {
 			stmt = ConnectDB.getHiveConnection().createStatement();
 
 			// Create Reports table - This table should exist
-			logger.info("Creating Reports Tmp Table");
+			logger.info("Creating Reports Table");
 			String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS "
 				+ ConnectDB.getDataSetUsageStatsDBSchema()
-				+ ".datacitereports_tmp(reportid STRING, \n"
+				+ ".datacitereports(reportid STRING, \n"
 				+ "	name STRING, \n"
 				+ "	source STRING,\n"
 				+ "	release STRING,\n"
@@ -94,10 +79,10 @@ public class DatasetsStatsDB {
 			logger.info("Reports Table Created");
 
 			// Create Datasets Performance Table
-			logger.info("Creating DataSetsPerformance Tmp Table");
+			logger.info("Creating DataSetsPerformance Table");
 			String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS "
 				+ ConnectDB.getDataSetUsageStatsDBSchema()
-				+ ".datasetsperformance_tmp(ds_type STRING,\n"
+				+ ".datasetsperformance(ds_type STRING,\n"
 				+ "	ds_title STRING,\n"
 				+ "	yop STRING,\n"
 				+ "	dataset_type STRING, \n"
@@ -115,22 +100,7 @@ public class DatasetsStatsDB {
 				+ "	CLUSTERED BY (ds_type)\n"
 				+ "	into 100 buckets stored as orc tblproperties('transactional'='true')";
 			stmt.executeUpdate(sqlCreateTableDataSetsPerformance);
-			logger.info("DataSetsPerformance Tmp Table Created");
-
-			logger.info("Creating Datacite Reports table");
-			String createDataciteReportsTable = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema()
-				+ ".datacitereports LIKE " + ConnectDB.getDataSetUsageStatsDBSchema()
-				+ ".datacitereports_tmp STORED AS PARQUET";
-			stmt.executeUpdate(createDataciteReportsTable);
-			logger.info("Datacite Reports Table created");
-
-			logger.info("Creating Datasets Performance table");
-			String createDatasetPerformanceTable = "CREATE TABLE IF NOT EXISTS "
-				+ ConnectDB.getDataSetUsageStatsDBSchema()
-				+ ".datasetsperformance LIKE " + ConnectDB.getDataSetUsageStatsDBSchema()
-				+ ".datasetsperformance_tmp STORED AS PARQUET";
-			stmt.executeUpdate(createDatasetPerformanceTable);
-			logger.info("DatasetsPerformance Table created");
+			logger.info("DataSetsPerformance Table Created");
 
 			stmt.close();
 			ConnectDB.getHiveConnection().close();
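With the _tmp/PARQUET copy step removed, datacitereports and datasetsperformance are now written once, directly as bucketed transactional ORC tables. A quick sanity check one might run after recreateDbAndTables (a sketch, assuming the ConnectDB wiring from this patch):

    // Sketch: list the tables the DDL above should have created
    Statement stmt = ConnectDB.getHiveConnection().createStatement();
    ResultSet rs = stmt.executeQuery("SHOW TABLES IN " + ConnectDB.getDataSetUsageStatsDBSchema());
    while (rs.next()) {
        System.out.println(rs.getString(1)); // expect: datacitereports, datasetsperformance
    }
    rs.close();
    stmt.close();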
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
index ffa8b8199..b28578e4b 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
@@ -21,7 +21,6 @@ public class ExecuteWorkflow {
 	static String dbHiveUrl;
 	static String dbImpalaUrl;
 	static String datasetUsageStatsDBSchema;
-	static String datasetsUsageStatsPermanentDBSchema;
 	static String statsDBSchema;
 	static boolean recreateDbAndTables;
 	static boolean datasetsEmptyDirs;
@@ -46,7 +45,6 @@ public class ExecuteWorkflow {
 		dbHiveUrl = parser.get("dbHiveUrl");
 		dbImpalaUrl = parser.get("dbImpalaUrl");
 		datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema");
-		datasetsUsageStatsPermanentDBSchema = parser.get("datasetsUsageStatsPermanentDBSchema");
 		statsDBSchema = parser.get("statsDBSchema");
 
 		if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
@@ -59,11 +57,11 @@ public class ExecuteWorkflow {
 		else
 			datasetsEmptyDirs = false;
 
-		if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
-			finalTablesVisibleToImpala = true;
-		else
-			finalTablesVisibleToImpala = false;
-
+//		if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
+//			finalTablesVisibleToImpala = true;
+//		else
+//			finalTablesVisibleToImpala = false;
+//
 		UsageStatsExporter usagestatsExport = new UsageStatsExporter();
 		usagestatsExport.export();
 	}
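A small robustness note on the flag parsing kept here: parser.get(...).toLowerCase().equals("true") throws a NullPointerException when the argument is missing. An equivalent, null-tolerant form (a sketch, not what the patch does):

    // Boolean.parseBoolean returns false for null and for anything other than "true" (case-insensitive)
    recreateDbAndTables = Boolean.parseBoolean(parser.get("recreateDbAndTables"));
    datasetsEmptyDirs = Boolean.parseBoolean(parser.get("datasetsEmptyDirs"));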
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
index e89e2e5a4..6e8c0e397 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
@@ -65,7 +65,7 @@
 		logger.info("Checking report with id " + reportID);
 		String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema()
-			+ ".datacitereports_tmp where reportid=?";
+			+ ".datacitereports where reportid=?";
 		PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists);
 		stGetReportID.setString(1, reportID);
 
@@ -76,7 +76,7 @@
 			dropTmpReportsTable();
 		} else {
 			String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
-				+ " .datacitereports_tmp "
+				+ " .datacitereports "
 				+ "SELECT\n"
 				+ "  get_json_object(json, '$.report.id') AS reportid,\n"
 				+ "  get_json_object(json, '$.report.report-header.report-name') AS name,\n"
@@ -105,7 +105,7 @@
 		ResultSet rstmpReportAll = stmt.getResultSet();
 		if (rstmpReportAll.next()) {
 			String listDatasets = rstmpReportAll.getString(1);
-			logger.info("Adding uncompressed performance for " + reportID);
+			logger.info("No compressed performance found");
 			this.readDatasetsReport(listDatasets, reportID);
 		}
@@ -125,9 +125,6 @@
 	}
 
 	public void readDatasetsReport(String prettyDatasetsReports, String reportId) throws Exception {
-		logger.info("Reading Datasets performance for report " + reportId);
-		logger.info("Write Performance Report To File");
-		ConnectDB.getHiveConnection().setAutoCommit(false);
 		ObjectMapper objectMapper = new ObjectMapper();
 		JsonNode jsonNode = objectMapper.readValue(prettyDatasetsReports, JsonNode.class);
 		String datasetsReports = jsonNode.toString();
@@ -154,7 +151,8 @@
 		fin.writeChar('\n');
 		fin.close();
 
-		logger.info("Reading Performance Report From File...");
+		logger.info("Write Compress Report To File");
+		logger.info("Reading Compress Report From File...");
 		String sqlCreateTempTableForDatasets = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema()
 			+ ".tmpjsoncompressesed (report_datasets array<struct<dataset_id:array<struct<value:string>>,dataset_title:string, data_type:string, "
@@ -177,7 +175,7 @@
 		stmt.execute(sqlCreateTempTableForDatasets);
 
 		String sqlInsertToDatasetsPerformance = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
-			+ ".datasetsperformance_tmp SELECT dataset.dataset_id[0].value ds_type, "
+			+ ".datasetsperformance SELECT dataset.dataset_id[0].value ds_type, "
 			+ "	dataset.dataset_title ds_title, "
 			+ "	dataset.yop yop, "
 			+ "	dataset.data_type dataset_type, "
@@ -198,7 +196,7 @@
 
 		stmt.executeUpdate(sqlInsertToDatasetsPerformance);
 
-		logger.info("Datasets Performance Inserted for Report " + reportId);
+		logger.info("Datasets Performance Inserted ");
 
 		stmt.execute("Drop table " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed");
 
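The hunks above flatten the COUNTER-style Datacite report JSON with Hive's get_json_object. A stripped-down sketch of the same technique, runnable against any Hive connection (the table name is illustrative, not from this patch):

    // Sketch: pull two header fields out of a raw JSON column, as the report insert above does
    Statement st = ConnectDB.getHiveConnection().createStatement();
    ResultSet rs = st
        .executeQuery(
            "SELECT get_json_object(json, '$.report.id'), "
                + "get_json_object(json, '$.report.report-header.report-name') "
                + "FROM sometmpschema.sometmptable"); // illustrative table holding one JSON document per row
    while (rs.next()) {
        System.out.println(rs.getString(1) + " -> " + rs.getString(2));
    }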
@@ -296,93 +294,32 @@
 	}
 
 	public void createUsageStatisticsTable() throws SQLException {
+		logger.info("Dropping Downloads Stats table");
 		Statement stmt = ConnectDB.getHiveConnection().createStatement();
-
-		logger.info("Updating Datacite Reports table");
-		String createDataciteReportsTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
-			+ ".datacitereports "
-			+ "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports_tmp";
-		stmt.executeUpdate(createDataciteReportsTable);
-		logger.info("Datacite Reports Table updated");
-
-		logger.info("Updating Datasets Performance table");
-		String createDatasetPerformanceTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
-			+ ".datasetsperformance "
-			+ "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance_tmp";
-		stmt.executeUpdate(createDatasetPerformanceTable);
-		logger.info("DatasetsPerformance Table updated");
+		String dropDownloadsTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema()
+			+ ".datacite_downloads";
+		stmt.executeUpdate(dropDownloadsTable);
 
 		logger.info("Creating Downloads Stats table");
 		String createDownloadsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema()
-			+ ".datacite_downloads STORED AS PARQUET as "
+			+ ".datacite_downloads as "
 			+ "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire "
 			+ "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance "
 			+ "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform "
 			+ "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid "
-			+ "where metric_type='total-dataset-requests' ";
+			+ "where metric_type='total-dataset-requests'";
 		stmt.executeUpdate(createDownloadsTable);
 		logger.info("Downloads Stats table created");
 
 		logger.info("Creating Views Stats table");
-		String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema()
-			+ ".datacite_views STORED AS PARQUET as "
+		String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views as "
 			+ "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire "
 			+ "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance "
 			+ "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform "
 			+ "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid "
-			+ "where metric_type='total-dataset-investigations' ";
+			+ "where metric_type='total-dataset-investigations'";
 		stmt.executeUpdate(createViewsTable);
 		logger.info("Views Stats table created");
-
-		logger.info("Building Permanent Datasets Usage Stats DB");
-
-		logger.info("Dropping view datacitereports on permanent datacite usagestats DB");
-		String sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports";
-		stmt.executeUpdate(sql);
-		logger.info("Dropped view datacitereports on permanent datacite usagestats DB");
-
-		logger.info("Create view datacitereports on permanent datacite usagestats DB");
-		sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"
-			+ " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports";
-		stmt.executeUpdate(sql);
-		logger.info("Created view datacitereports on permanent datasets usagestats DB");
-
-		logger.info("Dropping view datasetsperformance on permanent datacite usagestats DB");
-		sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance";
-		stmt.executeUpdate(sql);
-		logger.info("Dropped view datasetsperformance on permanent datacite usagestats DB");
-
-		logger.info("Create view datasetsperformance on permanent datacite usagestats DB");
-		sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"
-			+ " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance";
-		stmt.executeUpdate(sql);
-		logger.info("Created view datasetsperformance on permanent datasets usagestats DB");
-
-		logger.info("Dropping view datacite_views on permanent datacite usagestats DB");
-		sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views";
-		stmt.executeUpdate(sql);
-		logger.info("Dropped view datacite_views on permanent datacite usagestats DB");
-
-		logger.info("Create view datacite_views on permanent datacite usagestats DB");
-		sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"
-			+ " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views";
-		stmt.executeUpdate(sql);
-		logger.info("Created view datacite_views on permanent datasets usagestats DB");
-
-		logger.info("Dropping view datacite_downloads on permanent datacite usagestats DB");
-		sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads";
-		stmt.executeUpdate(sql);
-		logger.info("Dropped view datacite_downloads on permanent datacite usagestats DB");
-
-		logger.info("Create view datacite_downloads on permanent datacite usagestats DB");
-		sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"
-			+ " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads";
-		stmt.executeUpdate(sql);
-		logger.info("Created view datacite_downloads on permanent datasets usagestats DB");
-
-		stmt.close();
-		ConnectDB.getHiveConnection().close();
-		logger.info("Completed Building Permanent Datasets Usage Stats DB");
 	}
 }
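The date column in both CTAS statements is derived with regexp_replace(substring(string(period_end),0,7),'-','/'); a one-line worked example of what that produces:

    // Worked example of the date expression used in both CTAS statements above:
    // period_end = 2021-01-31  ->  substring(..., 0, 7) = "2021-01"  ->  regexp_replace(...) = "2021/01"
    String date = "2021-01-31".substring(0, 7).replace('-', '/'); // same transform in plain Java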
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
old mode 100755
new mode 100644
index 8d6e24333..d96d7e875
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
@@ -2,7 +2,6 @@
 package eu.dnetlib.oa.graph.datasetsusagestats.export;
 
 import java.io.IOException;
-import java.sql.SQLException;
 import java.sql.Statement;
 
 import org.apache.hadoop.conf.Configuration;
@@ -68,50 +67,5 @@
 		readReportsListFromDatacite.readReports();
 		logger.info("Reports Stored To DB");
 		readReportsListFromDatacite.createUsageStatisticsTable();
-
-		// Make the tables available to Impala
-		if (ExecuteWorkflow.finalTablesVisibleToImpala) {
-			logger.info("Making tables visible to Impala");
-			invalidateMetadata();
-		}
-
-		logger.info("End");
-	}
-
-	private void invalidateMetadata() throws SQLException {
-		Statement stmt = null;
-
-		stmt = ConnectDB.getImpalaConnection().createStatement();
-
-		String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads";
-		stmt.executeUpdate(sql);
-
-		sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views";
-		stmt.executeUpdate(sql);
-
-		sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports";
-		stmt.executeUpdate(sql);
-
-		sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance";
-		stmt.executeUpdate(sql);
-
-		sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads";
-		stmt.executeUpdate(sql);
-
-		sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views";
-		stmt.executeUpdate(sql);
-
-		sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports";
-		stmt.executeUpdate(sql);
-
-		sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance";
-		stmt.executeUpdate(sql);
-
-		stmt.close();
-		try {
-			ConnectDB.getHiveConnection().close();
-		} catch (Exception e) {
-			logger.info("Message at the end :" + e.getMessage());
-		}
 	}
 }
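Dropping invalidateMetadata() means nothing refreshes Impala's metadata cache after the Hive-side writes anymore. If that behaviour is needed again, the removed logic amounted to one statement per output table over the Impala connection, roughly:

    // Sketch of the removed Impala refresh, repeated per output table
    Statement stmt = ConnectDB.getImpalaConnection().createStatement();
    stmt.executeUpdate("INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views");
    stmt.close();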
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json
index f67651627..f8d51a882 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json
@@ -1,62 +1,56 @@
 [
-    {
-        "paramName": "dbu",
-        "paramLongName": "dataciteBaseURL",
-        "paramDescription": "URL of Datacite Reports Endpoint",
-        "paramRequired": true
-    },
-    {
-        "paramName": "drp",
-        "paramLongName": "dataciteReportPath",
-        "paramDescription": "Path for Datacite Reports",
-        "paramRequired": true
-    },
-    {
-        "paramName": "dbhu",
-        "paramLongName": "dbHiveUrl",
-        "paramDescription": "activate tranform-only mode. Only apply transformation step",
-        "paramRequired": true
-    },
-    {
-        "paramName": "dbiu",
-        "paramLongName": "dbImpalaUrl",
-        "paramDescription": "activate tranform-only mode. Only apply transformation step",
-        "paramRequired": true
-    },
-    {
-        "paramName": "dusdbs",
-        "paramLongName": "datasetUsageStatsDBSchema",
-        "paramDescription": "activate tranform-only mode. Only apply transformation step",
-        "paramRequired": true
-    },
-    {
-        "paramName": "uspdbs",
-        "paramLongName": "datasetsUsageStatsPermanentDBSchema",
-        "paramDescription": "activate tranform-only mode. Only apply transformation step",
-        "paramRequired": true
-    },
-    {
-        "paramName": "sdbs",
-        "paramLongName": "statsDBSchema",
-        "paramDescription": "activate tranform-only mode. Only apply transformation step",
-        "paramRequired": true
-    },
-    {
-        "paramName": "rdbt",
-        "paramLongName": "recreateDbAndTables",
-        "paramDescription": "Re-create database and initial tables?",
-        "paramRequired": true
-    },
-    {
-        "paramName": "pwed",
-        "paramLongName": "datasetsEmptyDirs",
-        "paramDescription": "Empty piwik directories?",
-        "paramRequired": true
-    },
-    {
-        "paramName": "ftvi",
-        "paramLongName": "finalTablesVisibleToImpala",
-        "paramDescription": "Make the dataset_usage_stats, visible to Impala",
-        "paramRequired": true
-    }
+  {
+    "paramName": "dbu",
+    "paramLongName": "dataciteBaseURL",
+    "paramDescription": "URL of Datacite Reports Endpoint",
+    "paramRequired": true
+  },
+  {
+    "paramName": "drp",
+    "paramLongName": "dataciteReportPath",
+    "paramDescription": "Path for Datacite Reports",
+    "paramRequired": true
+  },
+  {
+    "paramName": "dbhu",
+    "paramLongName": "dbHiveUrl",
+    "paramDescription": "Hive server jdbc url",
+    "paramRequired": true
+  },
+  {
+    "paramName": "dbiu",
+    "paramLongName": "dbImpalaUrl",
+    "paramDescription": "Impala server jdbc url",
+    "paramRequired": true
+  },
+  {
+    "paramName": "dusdbs",
+    "paramLongName": "datasetUsageStatsDBSchema",
+    "paramDescription": "Schema name of the datasets usage stats DB",
+    "paramRequired": true
+  },
+  {
+    "paramName": "sdbs",
+    "paramLongName": "statsDBSchema",
+    "paramDescription": "Schema name of the stats DB",
+    "paramRequired": true
+  },
+  {
+    "paramName": "rdbt",
+    "paramLongName": "recreateDbAndTables",
+    "paramDescription": "Re-create database and initial tables?",
+    "paramRequired": true
+  },
+  {
+    "paramName": "pwed",
+    "paramLongName": "datasetsEmptyDirs",
+    "paramDescription": "Empty the Datacite report directories?",
+    "paramRequired": true
+  },
+  {
+    "paramName": "ftvi",
+    "paramLongName": "finalTablesVisibleToImpala",
+    "paramDescription": "Make the dataset_usage_stats DB visible to Impala",
+    "paramRequired": true
+  }
 ]
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml
index 22bf22c01..36c1ccea5 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml
@@ -1,4 +1,4 @@
-
+
     <parameters>
         <property>
            <name>hiveMetastoreUris</name>
@@ -52,8 +52,6 @@
             <arg>${impalaJdbcUrl}</arg>
             <arg>--datasetUsageStatsDBSchema</arg>
             <arg>${datasetUsageStatsDBSchema}</arg>
-            <arg>--datasetsUsageStatsPermanentDBSchema</arg>
-            <arg>${datasetsUsageStatsPermanentDBSchema}</arg>
             <arg>--statsDBSchema</arg>
             <arg>${statsDBSchema}</arg>
             <arg>--recreateDbAndTables</arg>