From b081e01dad38d654a3cd387643aa96784689a4ee Mon Sep 17 00:00:00 2001 From: Dimitris Date: Wed, 17 Feb 2021 09:46:38 +0200 Subject: [PATCH 01/12] Added Datasets from Datacite WF --- dhp-workflows/dhp-indicators/pom.xml | 107 ++++++ dhp-workflows/dhp-indicators/runworkflow.sh | 1 + .../indicators/export/ExecuteWorkflow.java | 35 ++ .../indicators/oozie_app/config-default.xml | 38 ++ .../indicators/oozie_app/python/testpython.py | 5 + .../indicators/oozie_app/python/testscript.sh | 2 + .../graph/indicators/oozie_app/workflow.xml | 58 ++++ .../nb-configuration.xml | 18 + .../dhp-usage-datasets-stats-update/pom.xml | 121 +++++++ .../runworkflow.sh | 1 + .../datasetsusagestats/export/ConnectDB.java | 123 +++++++ .../export/DatasetsStatsDB.java | 114 ++++++ .../DownloadReportsListFromDatacite.java | 100 ++++++ .../export/ExecuteWorkflow.java | 69 ++++ .../export/ReadReportsListFromDatacite.java | 325 ++++++++++++++++++ .../export/UsageStatsExporter.java | 71 ++++ .../datasets_usagestats_parameters.json | 56 +++ .../oozie_app/config-default.xml | 38 ++ .../datasetsusagestats/oozie_app/workflow.xml | 70 ++++ .../usagestatsbuild/export/PiwikStatsDB.java | 6 +- nbactions.xml | 15 + 21 files changed, 1370 insertions(+), 3 deletions(-) create mode 100644 dhp-workflows/dhp-indicators/pom.xml create mode 100755 dhp-workflows/dhp-indicators/runworkflow.sh create mode 100644 dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml create mode 100755 dhp-workflows/dhp-usage-datasets-stats-update/pom.xml create mode 100755 dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml create mode 100644 
dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml create mode 100644 nbactions.xml diff --git a/dhp-workflows/dhp-indicators/pom.xml b/dhp-workflows/dhp-indicators/pom.xml new file mode 100644 index 000000000..937795791 --- /dev/null +++ b/dhp-workflows/dhp-indicators/pom.xml @@ -0,0 +1,107 @@ + + + + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + + 4.0.0 + dhp-indicators + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + dhp-indicators + diff --git a/dhp-workflows/dhp-indicators/runworkflow.sh b/dhp-workflows/dhp-indicators/runworkflow.sh new file mode 100755 index 000000000..0cad5792d --- /dev/null +++ b/dhp-workflows/dhp-indicators/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java new file mode 100644 index 000000000..61e6ef72c --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java @@ -0,0 +1,35 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.indicators.export; + +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.BasicConfigurator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * @author D. 
Pierrakos + */ +public class ExecuteWorkflow { + + private static final Logger logger = LoggerFactory.getLogger(ExecuteWorkflow.class); + + public static void main(String args[]) throws Exception { + + // Sending the logs to the console + BasicConfigurator.configure(); + + logger.info("Workflow Executed"); + } + +} diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml new file mode 100644 index 000000000..b5c807378 --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py new file mode 100644 index 000000000..e913df6ae --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py @@ -0,0 +1,5 @@ +#! /usr/bin/env python +import sys + +print "this is a Python script" +print "Python Interpreter Version: " + sys.version \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh new file mode 100644 index 000000000..78938c85a --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh @@ -0,0 +1,2 @@ +#!/bin/bash +echo "`date` hi" \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml new file mode 100644 index 000000000..2b8ed7d99 --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml @@ -0,0 +1,58 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + ${jobTracker} + ${nameNode} + + + mapred.job.queue.name + ${queueName} + + + testpython.py + python/testpython.py + + + + + + + Python action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml 
b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml new file mode 100644 index 000000000..a65c4514a --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml @@ -0,0 +1,18 @@ + + + + + + JDK_1.8 + + diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml new file mode 100755 index 000000000..b39c3ff9b --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml @@ -0,0 +1,121 @@ + + + + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + ../ + + 4.0.0 + dhp-usage-datasets-stats-update + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + 2.7.4 + jar + + + eu.dnetlib.dhp + dhp-common + 1.1.7-SNAPSHOT + jar + + + com.mchange + c3p0 + 0.9.5.2 + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + org.slf4j + slf4j-api + 1.7.26 + jar + + + dhp-usage-datasets-stats-update + diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh new file mode 100755 index 000000000..9b4325508 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java new file mode 100644 index 000000000..cab0bc83f --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java @@ -0,0 +1,123 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.log4j.Logger; + +/** + * @author D. Pierrakos + */ +/** + * @author D. 
Pierrakos + */ +import com.mchange.v2.c3p0.ComboPooledDataSource; + +public abstract class ConnectDB { + + public static Connection DB_HIVE_CONNECTION; + public static Connection DB_IMPALA_CONNECTION; + + private static String dbHiveUrl; + private static String dbImpalaUrl; + private static String datasetUsageStatsDBSchema; + private static String statsDBSchema; + private final static Logger logger = Logger.getLogger(ConnectDB.class); + private Statement stmt = null; + + static void init() throws ClassNotFoundException { + + dbHiveUrl = ExecuteWorkflow.dbHiveUrl; + dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; + datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema; + statsDBSchema = ExecuteWorkflow.statsDBSchema; + + Class.forName("org.apache.hive.jdbc.HiveDriver"); + } + + public static Connection getHiveConnection() throws SQLException { + if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { + return DB_HIVE_CONNECTION; + } else { + DB_HIVE_CONNECTION = connectHive(); + + return DB_HIVE_CONNECTION; + } + } + + public static Connection getImpalaConnection() throws SQLException { + if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { + return DB_IMPALA_CONNECTION; + } else { + DB_IMPALA_CONNECTION = connectImpala(); + + return DB_IMPALA_CONNECTION; + } + } + + public static String getDataSetUsageStatsDBSchema() { + return ConnectDB.datasetUsageStatsDBSchema; + } + + public static String getStatsDBSchema() { + return ConnectDB.statsDBSchema; + } + + private static Connection connectHive() throws SQLException { + + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbHiveUrl); + cpds.setUser("dimitris.pierrakos"); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + + logger.info("Opened database successfully"); + + return cpds.getConnection(); + + } + + private static Connection connectImpala() throws SQLException { + + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbImpalaUrl); + cpds.setUser("dimitris.pierrakos"); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + + logger.info("Opened database successfully"); + return cpds.getConnection(); + + } +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java new file mode 100644 index 000000000..17661b99e --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java @@ -0,0 +1,114 @@ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; + +import 
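
An illustrative sketch (not part of the patch itself) of the c3p0 pooling technique ConnectDB uses above: a ComboPooledDataSource wrapped around the Hive JDBC driver instead of raw DriverManager connections. The JDBC URL, user and class name below are placeholders, not the workflow's real settings.

```java
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;

import com.mchange.v2.c3p0.ComboPooledDataSource;

public class PooledHiveConnectionSketch {

    public static void main(String[] args) throws Exception {
        // Register the Hive JDBC driver, as ConnectDB.init() does
        Class.forName("org.apache.hive.jdbc.HiveDriver");

        ComboPooledDataSource cpds = new ComboPooledDataSource();
        cpds.setJdbcUrl("jdbc:hive2://example-host:10000/;UseNativeQuery=1"); // placeholder URL
        cpds.setUser("some.user");                                            // placeholder user
        cpds.setMaxPoolSize(10);
        cpds.setPreferredTestQuery("SELECT 1");
        cpds.setIdleConnectionTestPeriod(60);

        // Connections are borrowed from the pool; closing them returns them to the pool
        try (Connection con = cpds.getConnection();
                Statement stmt = con.createStatement();
                ResultSet rs = stmt.executeQuery("SELECT 1")) {
            while (rs.next()) {
                System.out.println(rs.getInt(1));
            }
        }
    }
}
```
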
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos + */ +public class DatasetsStatsDB { + + private String logPath; + private String logRepoPath; + private String logPortalPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class); + + public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception { + this.logRepoPath = logRepoPath; + this.logPortalPath = logPortalPath; + + } + + public void recreateDBAndTables() throws Exception { + this.createDatabase(); + this.createTables(); + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + // Create Reports table - This table should exist + logger.info("Creating Reports Table"); + String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports(reportid STRING, \n" + + " name STRING, \n" + + " source STRING,\n" + + " release STRING,\n" + + " createdby STRING,\n" + + " report_start_date STRING,\n" + + " report_end_date STRING)\n" + + " CLUSTERED BY (reportid)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + + stmt.executeUpdate(sqlCreateTableDataciteReports); + logger.info("Reports Table Created"); + + // Create Datasets Performance Table + logger.info("Creating DataSetsPerformance Table"); + String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance(ds_type STRING,\n" + + " ds_title STRING,\n" + + " yop STRING,\n" + + " dataset_type STRING, \n" + + " uri STRING,\n" + + " platform STRING,\n" + + " publisher STRING,\n" + + " publisher_id array>,\n" + + " dataset_contributors array>,\n" + + " period_end STRING,\n" + + " period_from STRING,\n" + + " access_method STRING,\n" + + " metric_type STRING,\n" + + " count INT,\n" + + " reportid STRING)\n" + + " CLUSTERED BY (ds_type)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableDataSetsPerformance); + logger.info("DataSetsPerformance Table Created"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java 
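
A minimal sketch of the table-creation pattern DatasetsStatsDB applies above: a bucketed, ORC-stored, transactional Hive table created through plain JDBC. The schema argument and the "_example" table name are placeholders, not the patch's actual objects.

```java
import java.sql.Connection;
import java.sql.Statement;

public class CreateOrcTableSketch {

    // Issues a CREATE TABLE shaped like datacitereports above: clustered into buckets,
    // stored as ORC, with transactional=true so Hive ACID operations are allowed.
    public static void createReportsTable(Connection hive, String schema) throws Exception {
        String ddl = "CREATE TABLE IF NOT EXISTS " + schema + ".datacitereports_example ("
            + "reportid STRING, name STRING, source STRING, release STRING, "
            + "createdby STRING, report_start_date STRING, report_end_date STRING) "
            + "CLUSTERED BY (reportid) INTO 100 BUCKETS "
            + "STORED AS ORC TBLPROPERTIES ('transactional'='true')";
        try (Statement stmt = hive.createStatement()) {
            stmt.executeUpdate(ddl);
        }
    }
}
```
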
b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java new file mode 100644 index 000000000..02754e173 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java @@ -0,0 +1,100 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Iterator; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +/** + * @author D.Pierrakos + */ +public class DownloadReportsListFromDatacite { + + private String dataciteBaseURL; + private String dataciteReportPath; + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) + throws MalformedURLException, Exception { + + this.dataciteBaseURL = dataciteBaseURL; + this.dataciteReportPath = dataciteReportPath; + } + + public void downloadReportsList() throws ParseException { + StringBuilder responseStrBuilder = new StringBuilder(); + + Gson gson = new Gson(); + + try { + BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream()); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String inputStr; + + while ((inputStr = streamReader.readLine()) != null) { + responseStrBuilder.append(inputStr); + } + } catch (IOException e) { + logger.info(e.getMessage()); + } + JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class); + JsonArray dataArray = jsonObject.getAsJsonArray("reports"); + ArrayList reportsList = new ArrayList(); + for (JsonElement element : dataArray) { + reportsList.add(element.getAsJsonObject().get("id").getAsString()); + } + + Iterator it = reportsList.iterator(); + while (it.hasNext()) { + String reportId = it.next().toString(); + String url = dataciteBaseURL + reportId; + + try { + BufferedInputStream in = new BufferedInputStream(new URL(url).openStream()); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String inputStr; + StringBuilder responseStrBuilder2 = new StringBuilder(); + while ((inputStr = streamReader.readLine()) != null) { + responseStrBuilder2.append(inputStr); + } + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path(dataciteReportPath + "/" + reportId + ".json"), + true); + byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + fin.close(); + + fin.close(); + } catch (IOException e) { + System.out.println(e); + } + } + } +} diff --git 
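
DownloadReportsListFromDatacite above reads the reports listing from the Datacite endpoint, pulls each report `id` out with Gson, and then streams every report into HDFS. A self-contained sketch of the Gson step, using the same calls; the sample payload and class name are made up for illustration.

```java
import java.util.ArrayList;
import java.util.List;

import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

public class ReportsListParsingSketch {

    // Extracts the report ids from a reports listing, mirroring downloadReportsList()
    public static List<String> reportIds(String reportsListJson) {
        JsonObject root = new Gson().fromJson(reportsListJson, JsonObject.class);
        JsonArray reports = root.getAsJsonArray("reports");
        List<String> ids = new ArrayList<>();
        for (JsonElement element : reports) {
            ids.add(element.getAsJsonObject().get("id").getAsString());
        }
        return ids;
    }

    public static void main(String[] args) {
        // made-up sample payload, only to show the shape the parser expects
        String sample = "{\"reports\":[{\"id\":\"report-1\"},{\"id\":\"report-2\"}]}";
        System.out.println(reportIds(sample)); // [report-1, report-2]
    }
}
```
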
a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java new file mode 100644 index 000000000..b28578e4b --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java @@ -0,0 +1,69 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.BasicConfigurator; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class ExecuteWorkflow { + + static String dataciteBaseURL; + static String dataciteReportPath; + static String dbHiveUrl; + static String dbImpalaUrl; + static String datasetUsageStatsDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; + static boolean datasetsEmptyDirs; + static boolean finalTablesVisibleToImpala; + + public static void main(String args[]) throws Exception { + + // Sending the logs to the console + BasicConfigurator.configure(); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + UsageStatsExporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json"))); + parser.parseArgument(args); + + // Setting up the initial parameters + dataciteBaseURL = parser.get("dataciteBaseURL"); + dataciteReportPath = parser.get("dataciteReportPath"); + dbHiveUrl = parser.get("dbHiveUrl"); + dbImpalaUrl = parser.get("dbImpalaUrl"); + datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema"); + statsDBSchema = parser.get("statsDBSchema"); + + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) + recreateDbAndTables = true; + else + recreateDbAndTables = false; + + if (parser.get("datasetsEmptyDirs").toLowerCase().equals("true")) + datasetsEmptyDirs = true; + else + datasetsEmptyDirs = false; + +// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) +// finalTablesVisibleToImpala = true; +// else +// finalTablesVisibleToImpala = false; +// + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); + usagestatsExport.export(); + } + +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java new file mode 100644 index 000000000..6e8c0e397 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -0,0 +1,325 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
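
ExecuteWorkflow above loads the JSON parameter specification from the classpath, parses the command line with ArgumentApplicationParser, and turns the boolean flags into fields by comparing against the literal string "true". A sketch of the same flow, assuming only the parser API already shown in the patch; Boolean.parseBoolean gives the equivalent result and does not throw if a flag is absent. The class name is illustrative.

```java
import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class FlagParsingSketch {

    public static void main(String[] args) throws Exception {
        // Load the same parameter specification from the classpath, as ExecuteWorkflow does
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils.toString(
                FlagParsingSketch.class.getResourceAsStream(
                    "/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json")));
        parser.parseArgument(args);

        // Equivalent to the explicit "true" comparison, but safe when the flag is missing
        boolean recreateDbAndTables = Boolean.parseBoolean(parser.get("recreateDbAndTables"));
        boolean datasetsEmptyDirs = Boolean.parseBoolean(parser.get("datasetsEmptyDirs"));
        System.out.println(recreateDbAndTables + " " + datasetsEmptyDirs);
    }
}
```
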
+ */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.io.*; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.Base64; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * @author D.Pierrakos + */ +public class ReadReportsListFromDatacite { + + private String dataciteReportPath; + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception { + + this.dataciteReportPath = dataciteReportPath; + } + + public void readReports() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + ArrayList jsonFiles = listHdfsDir(dataciteReportPath); + for (String jsonFile : jsonFiles) { + logger.info("Reading report file " + jsonFile); + this.createTmpReportsTable(jsonFile); + + String sqlSelectReportID = "SELECT get_json_object(json, '$.report.id') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsonToTable"; + stmt.execute(sqlSelectReportID); + ResultSet rstmpReportID = stmt.getResultSet(); + + String reportID = null; + while (rstmpReportID.next()) { + reportID = rstmpReportID.getString(1); + } + + logger.info("Checking report with id " + reportID); + String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports where reportid=?"; + PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); + stGetReportID.setString(1, reportID); + + ResultSet rsCheckIfReportExist = stGetReportID.executeQuery(); + + if (rsCheckIfReportExist.next()) { + logger.info("Report found with ID " + reportID); + dropTmpReportsTable(); + } else { + String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + " .datacitereports " + + "SELECT\n" + + " get_json_object(json, '$.report.id') AS reportid,\n" + + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" + + " get_json_object(json, '$.report.report-header.report-id') AS source,\n" + + " get_json_object(json, '$.report.report-header.release') AS release,\n" + + " get_json_object(json, '$.report.report-header.created-by\') AS createdby,\n" + + " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n" + + " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n" + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsonToTable"; + stmt.execute(sqlInsertReport); + + logger.info("Report added"); + + logger.info("Adding datasets"); + String sqlSelecteDatasetsArray = "SELECT get_json_object(json, '$.report.report-datasets') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsonToTable"; + 
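
readReports() stages each raw JSON report as a single string column in a temporary Hive table and then extracts individual fields with get_json_object. A minimal sketch of that query pattern over JDBC; the schema argument and class name are placeholders.

```java
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;

public class GetJsonObjectSketch {

    // Reads the report id out of the staging table with Hive's get_json_object,
    // the same pattern readReports() uses for every header field.
    public static String reportId(Connection hive, String schema) throws Exception {
        String query = "SELECT get_json_object(json, '$.report.id') FROM " + schema + ".tmpjsonToTable";
        try (Statement stmt = hive.createStatement();
                ResultSet rs = stmt.executeQuery(query)) {
            return rs.next() ? rs.getString(1) : null;
        }
    }
}
```
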
stmt.execute(sqlSelecteDatasetsArray); + ResultSet rstmpReportDatasets = stmt.getResultSet(); + + if (rstmpReportDatasets.next() && rstmpReportDatasets.getString(1).indexOf(',') > 0) { + // String[] listDatasets = rstmpReportDatasets.getString(1).split(","); + // String listDatasets = rstmpReportDatasets.getString(1); + String sqlSelectReport = "SELECT * FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsonToTable"; + stmt.execute(sqlSelectReport); + ResultSet rstmpReportAll = stmt.getResultSet(); + if (rstmpReportAll.next()) { + String listDatasets = rstmpReportAll.getString(1); + logger.info("No compressed performance found"); + this.readDatasetsReport(listDatasets, reportID); + } + + } + logger.info("Adding gziped performance for datasets"); + String sqlSelecteReportSubsets = "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsonToTable"; + stmt.execute(sqlSelecteReportSubsets); + ResultSet rstmpReportSubsets = stmt.getResultSet(); + if (rstmpReportSubsets.next()) { + String unCompressedReport = uncompressString(rstmpReportSubsets.getString(1)); + this.readDatasetsReport(unCompressedReport, reportID); + } + } + } + this.dropTmpReportsTable(); + } + + public void readDatasetsReport(String prettyDatasetsReports, String reportId) throws Exception { + ObjectMapper objectMapper = new ObjectMapper(); + JsonNode jsonNode = objectMapper.readValue(prettyDatasetsReports, JsonNode.class); + String datasetsReports = jsonNode.toString(); + String report = datasetsReports + .replace("report-datasets", "report_datasets") + .replace("dataset-title", "dataset_title") + .replace("dataset-id", "dataset_id") + .replace("data-type", "data_type") + .replace("publisher-id", "publisher_id") + .replace("dataset-contributors", "dataset_contributors") + .replace("begin-date", "begin_date") + .replace("end-date", "end_date") + .replace("access-method", "access_method") + .replace("metric-type", "metric_type") + .replace("doi:", ""); + FileSystem fs = FileSystem.get(new Configuration()); + String tmpPath = dataciteReportPath + "/tmpjson"; + FSDataOutputStream fin = fs + .create(new Path(dataciteReportPath + "/tmpjson/" + reportId + "_Compressed.json"), true); + byte[] jsonObjectRawBytes = report.getBytes(); + + fin.write(jsonObjectRawBytes); + + fin.writeChar('\n'); + fin.close(); + + logger.info("Write Compress Report To File"); + logger.info("Reading Compress Report From File..."); + + String sqlCreateTempTableForDatasets = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".tmpjsoncompressesed (report_datasets array>,dataset_title:string, data_type:string, " + + "uri:string, publisher:string, publisher_id:array>,platform:string, yop:string, " + + "dataset_contributors:array>," + + "performance:array, " + + "instance:array>>>>>) " + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + tmpPath + "'"; + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Inserting Datasets Performance"); + stmt.execute(sqlCreateTempTableForDatasets); + + String sqlInsertToDatasetsPerformance = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance SELECT dataset.dataset_id[0].value ds_type, " + + " 
dataset.dataset_title ds_title, " + + " dataset.yop yop, " + + " dataset.data_type dataset_type, " + + " dataset.uri uri, " + + " dataset.platform platform, " + + " dataset.publisher publisher, " + + " dataset.publisher_id publisher_id, " + + " dataset.dataset_contributors dataset_contributors, " + + " period.end_date period_end, " + + " period.begin_date period_from, " + + " performance.access_method access_method, " + + " performance.metric_type metric_type, " + + " performance.count count, " + + "'" + reportId + "' report_id " + + " FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed " + + " LATERAL VIEW explode(report_datasets) exploded_table as dataset LATERAL VIEW explode(dataset.performance[0].instance) exploded_table2 as performance " + + " LATERAL VIEW explode (array(dataset.performance[0].period)) exploded_table3 as period"; + + stmt.executeUpdate(sqlInsertToDatasetsPerformance); + + logger.info("Datasets Performance Inserted "); + + stmt.execute("Drop table " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed"); + + logger.info("Datasets Report Added"); + + } + + private ArrayList listHdfsDir(String dir) throws Exception { + + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + fileNames.add(fileName); + } + + hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir)); + throw new Exception("HDFS file path with exported data does not exist : " + dir, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + sb.append(line); + // sb.append(line); + line = br.readLine(); + } + // uncompressedReport = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + result = sb.toString().trim(); + // fs.close(); + } catch (Exception e) { + throw new Exception(e); + } + + return result; + } + + public static String uncompressString(String zippedBase64Str) + throws IOException { + String uncompressedReport = null; + + byte[] bytes = Base64.getDecoder().decode(zippedBase64Str); + GZIPInputStream zi = null; + try { + zi = new GZIPInputStream(new ByteArrayInputStream(bytes)); + uncompressedReport = IOUtils.toString(zi); + } finally { + IOUtils.closeQuietly(zi); + } + logger.info("Report Succesfully Uncompressed..."); + return uncompressedReport; + } + + private void createTmpReportsTable(String jsonFile) throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + dropTmpReportsTable(); + String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".tmpjsonToTable (json STRING)"; + stmt.executeUpdate(createTmpTable); + logger.info("Temporary Table for Json Report Created"); + + String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsonToTable"; + stmt.execute(insertJsonReport); + logger.info("JSON Report File 
inserted to tmpjsonToTable Table"); + } + + private void dropTmpReportsTable() throws SQLException { + logger.info("Dropping tmpjson Table"); + String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsonToTable"; + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + stmt.executeUpdate(dropTmpTable); + logger.info("Dropped Table for Json Report Table"); + + } + + public void createUsageStatisticsTable() throws SQLException { + logger.info("Dropping Downloads Stats table"); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String dropDownloadsTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacite_downloads"; + stmt.executeUpdate(dropDownloadsTable); + + logger.info("Creating Downloads Stats table"); + String createDownloadsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacite_downloads as " + + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " + + "where metric_type='total-dataset-requests'"; + stmt.executeUpdate(createDownloadsTable); + logger.info("Downloads Stats table created"); + + logger.info("Creating Views Stats table"); + String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views as " + + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " + + "where metric_type='total-dataset-investigations'"; + stmt.executeUpdate(createViewsTable); + logger.info("Views Stats table created"); + } + +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java new file mode 100644 index 000000000..d96d7e875 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java @@ -0,0 +1,71 @@ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.io.IOException; +import java.sql.Statement; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Main class for downloading and processing Usage statistics + * + * @author D. Pierrakos, S. 
Zoupanos + */ +public class UsageStatsExporter { + + private Statement stmt = null; + + public UsageStatsExporter() { + + } + + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + private void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath); + dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true); + + logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath)); + + logger.info("Creating tmp directory: " + ExecuteWorkflow.dataciteReportPath + " " + "/tmpjson/"); + dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath + "/tmpjson/")); + + } + + public void export() throws Exception { + + logger.info("Initialising DB properties"); + ConnectDB.init(); + ConnectDB.getHiveConnection(); + + if (ExecuteWorkflow.recreateDbAndTables) { + DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", ""); + datasetsDB.recreateDBAndTables(); + } + logger.info("Initializing the download logs module"); + DownloadReportsListFromDatacite downloadReportsListFromDatacite = new DownloadReportsListFromDatacite( + ExecuteWorkflow.dataciteBaseURL, + ExecuteWorkflow.dataciteReportPath); + + if (ExecuteWorkflow.datasetsEmptyDirs) { + logger.info("Downloading Reports List From Datacite"); + this.reCreateLogDirs(); + downloadReportsListFromDatacite.downloadReportsList(); + logger.info("Reports List has been downloaded"); + } + + ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite( + ExecuteWorkflow.dataciteReportPath); + logger.info("Store Reports To DB"); + readReportsListFromDatacite.readReports(); + logger.info("Reports Stored To DB"); + readReportsListFromDatacite.createUsageStatisticsTable(); + } +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json new file mode 100644 index 000000000..f8d51a882 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json @@ -0,0 +1,56 @@ +[ + { + "paramName": "dbu", + "paramLongName": "dataciteBaseURL", + "paramDescription": "URL of Datacite Reports Endpoint", + "paramRequired": true + }, + { + "paramName": "drp", + "paramLongName": "dataciteReportPath", + "paramDescription": "Path for Datacite Reports", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dusdbs", + "paramLongName": "datasetUsageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. 
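
UsageStatsExporter.reCreateLogDirs() above wipes and recreates the Datacite report directory on HDFS before downloading a fresh batch. A short sketch of that step with the Hadoop FileSystem API used in the patch; the path argument is a placeholder.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RecreateDirsSketch {

    // Deletes the report directory recursively, then recreates it together with
    // the tmpjson subdirectory used for the staged per-report JSON files.
    public static void recreate(String reportPath) throws Exception {
        FileSystem dfs = FileSystem.get(new Configuration());
        dfs.delete(new Path(reportPath), true); // recursive delete
        dfs.mkdirs(new Path(reportPath));
        dfs.mkdirs(new Path(reportPath + "/tmpjson/"));
    }
}
```
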
Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "datasetsEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the dataset_usage_stats, visible to Impala", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml new file mode 100644 index 000000000..b5c807378 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml new file mode 100644 index 000000000..36c1ccea5 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml @@ -0,0 +1,70 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + eu.dnetlib.oa.graph.datasetsusagestats.export.ExecuteWorkflow + --dataciteBaseURL + ${dataciteBaseURL} + --dataciteReportPath + ${dataciteReportPath} + --dbHiveUrl + ${hiveJdbcUrl} + --dbImpalaUrl + ${impalaJdbcUrl} + --datasetUsageStatsDBSchema + ${datasetUsageStatsDBSchema} + --statsDBSchema + ${statsDBSchema} + --recreateDbAndTables + ${recreateDbAndTables} + --datasetsEmptyDirs + ${datasetsEmptyDirs} + --finalTablesVisibleToImpala + ${finalTablesVisibleToImpala} + + + + + + + + diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java index 253dc03b5..5a6953f4c 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java @@ -132,7 
+132,7 @@ public class PiwikStatsDB { + "max(views) AS count, max(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " - + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' " + + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' " + "GROUP BY d.id, ro.id, month " + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(create_views_stats); @@ -145,7 +145,7 @@ public class PiwikStatsDB { + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + "WHERE p.source=" + ExecuteWorkflow.portalMatomoID - + " AND p.source=d.piwik_id and p.id=ro.id AND ro.oid!='200' " + + " AND p.source=d.piwik_id and p.id=ro.id AND ro.oid!='200' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' " + "GROUP BY d.id, ro.id, month " + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(create_pageviews_stats); @@ -194,7 +194,7 @@ public class PiwikStatsDB { + "max(downloads) AS count, max(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " - + "WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' " + + "WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' AND d.id!='re3data_____::7b0ad08687b2c960d5aeef06f811d5e6' " + "GROUP BY d.id, ro.id, month " + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(sql); diff --git a/nbactions.xml b/nbactions.xml new file mode 100644 index 000000000..4b6f7519d --- /dev/null +++ b/nbactions.xml @@ -0,0 +1,15 @@ + + + + test + + * + + + test + + + true + + + -- 2.17.1 From ac5b7f89b5b71f098bd63ec31eadd3afb5ce5e15 Mon Sep 17 00:00:00 2001 From: Dimitris Date: Wed, 17 Feb 2021 14:56:13 +0200 Subject: [PATCH 02/12] Changes to logger.info --- .gitignore | 3 +-- .../export/ReadReportsListFromDatacite.java | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 2d7730711..f5d6c2bc0 100644 --- a/.gitignore +++ b/.gitignore @@ -23,5 +23,4 @@ /build spark-warehouse /**/job-override.properties -/**/*.log - +/**/*.log \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java index 6e8c0e397..1b769bf53 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -105,7 +105,7 @@ public class ReadReportsListFromDatacite { ResultSet rstmpReportAll = stmt.getResultSet(); if (rstmpReportAll.next()) { String listDatasets = rstmpReportAll.getString(1); - logger.info("No compressed performance found"); + logger.info("Adding uncompressed performance for " + reportID); this.readDatasetsReport(listDatasets, reportID); } @@ -125,6 +125,9 @@ public class ReadReportsListFromDatacite { } public 
void readDatasetsReport(String prettyDatasetsReports, String reportId) throws Exception { + logger.info("Reading Datasets performance for report " + reportId); + logger.info("Write Performance Report To File"); + ObjectMapper objectMapper = new ObjectMapper(); JsonNode jsonNode = objectMapper.readValue(prettyDatasetsReports, JsonNode.class); String datasetsReports = jsonNode.toString(); @@ -151,8 +154,7 @@ public class ReadReportsListFromDatacite { fin.writeChar('\n'); fin.close(); - logger.info("Write Compress Report To File"); - logger.info("Reading Compress Report From File..."); + logger.info("Reading Performance Report From File..."); String sqlCreateTempTableForDatasets = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed (report_datasets array>,dataset_title:string, data_type:string, " @@ -196,7 +198,7 @@ public class ReadReportsListFromDatacite { stmt.executeUpdate(sqlInsertToDatasetsPerformance); - logger.info("Datasets Performance Inserted "); + logger.info("Datasets Performance Inserted for Report " + reportId); stmt.execute("Drop table " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed"); -- 2.17.1 From 533bde32283945d74b41a5f53c0ac9a77c5668b9 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 4 Jun 2021 15:47:32 +0300 Subject: [PATCH 03/12] Commit datasets changes --- dhp-workflows/dhp-indicators/pom.xml | 107 ------------- dhp-workflows/dhp-indicators/runworkflow.sh | 1 - .../indicators/export/ExecuteWorkflow.java | 35 ----- .../indicators/oozie_app/config-default.xml | 38 ----- .../indicators/oozie_app/python/testpython.py | 5 - .../indicators/oozie_app/python/testscript.sh | 2 - .../graph/indicators/oozie_app/workflow.xml | 58 ------- .../dhp-usage-datasets-stats-update/pom.xml | 4 +- .../datasetsusagestats/export/ConnectDB.java | 33 +++- .../export/DatasetsStatsDB.java | 46 +++++- .../export/ExecuteWorkflow.java | 12 +- .../export/ReadReportsListFromDatacite.java | 85 +++++++++-- .../export/UsageStatsExporter.java | 46 ++++++ .../datasets_usagestats_parameters.json | 114 +++++++------- .../datasetsusagestats/oozie_app/workflow.xml | 4 +- .../usagerawdata/export/ExecuteWorkflow.java | 4 + .../export/PiwikDownloadLogs.java | 2 +- .../usagerawdata/export/PiwikStatsDB.java | 43 ++++++ .../export/UsageStatsExporter.java | 15 +- .../export/usagerawdata_parameters.json | 6 + .../graph/usagerawdata/oozie_app/workflow.xml | 3 +- .../usagestatsbuild/export/ConnectDB.java | 2 +- .../usagestatsbuild/export/PiwikStatsDB.java | 143 ++++++++++++++++-- .../export/UsageStatsExporter.java | 3 + .../usagestatsbuild/oozie_app/workflow.xml | 2 +- 25 files changed, 463 insertions(+), 350 deletions(-) delete mode 100644 dhp-workflows/dhp-indicators/pom.xml delete mode 100755 dhp-workflows/dhp-indicators/runworkflow.sh delete mode 100644 dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml mode change 100644 => 100755 
dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java diff --git a/dhp-workflows/dhp-indicators/pom.xml b/dhp-workflows/dhp-indicators/pom.xml deleted file mode 100644 index 937795791..000000000 --- a/dhp-workflows/dhp-indicators/pom.xml +++ /dev/null @@ -1,107 +0,0 @@ - - - - - - - - - dhp-workflows - eu.dnetlib.dhp - 1.1.7-SNAPSHOT - - 4.0.0 - dhp-indicators - - - - pl.project13.maven - git-commit-id-plugin - 2.1.15 - - - - revision - - - - - ${project.basedir}/../.git - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.6.1 - - 1.8 - 1.8 - - - - - - UTF-8 - UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 - - - - - org.apache.spark - spark-core_2.11 - 2.2.0 - - - org.apache.spark - spark-sql_2.11 - 2.4.5 - - - com.googlecode.json-simple - json-simple - 1.1.1 - - - org.json - json - 20180130 - jar - - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - c3p0 - c3p0 - 0.9.1.2 - jar - - - dhp-indicators - diff --git a/dhp-workflows/dhp-indicators/runworkflow.sh b/dhp-workflows/dhp-indicators/runworkflow.sh deleted file mode 100755 index 0cad5792d..000000000 --- a/dhp-workflows/dhp-indicators/runworkflow.sh +++ /dev/null @@ -1 +0,0 @@ -mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java deleted file mode 100644 index 61e6ef72c..000000000 --- a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.indicators.export; - -import java.text.SimpleDateFormat; -import java.util.Calendar; -import java.util.Date; - -import org.apache.commons.io.IOUtils; -import org.apache.log4j.BasicConfigurator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -/** - * @author D. 
Pierrakos - */ -public class ExecuteWorkflow { - - private static final Logger logger = LoggerFactory.getLogger(ExecuteWorkflow.class); - - public static void main(String args[]) throws Exception { - - // Sending the logs to the console - BasicConfigurator.configure(); - - logger.info("Workflow Executed"); - } - -} diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml deleted file mode 100644 index b5c807378..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - jobTracker - ${jobTracker} - - - nameNode - ${nameNode} - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 - - - impalaJdbcUrl - jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; - - - oozie.wf.workflow.notification.url - {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status - - - oozie.use.system.libpath - true - - diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py deleted file mode 100644 index e913df6ae..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py +++ /dev/null @@ -1,5 +0,0 @@ -#! /usr/bin/env python -import sys - -print "this is a Python script" -print "Python Interpreter Version: " + sys.version \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh deleted file mode 100644 index 78938c85a..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -echo "`date` hi" \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml deleted file mode 100644 index 2b8ed7d99..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml +++ /dev/null @@ -1,58 +0,0 @@ - - - - hiveMetastoreUris - Hive server metastore URIs - - - hiveJdbcUrl - Hive server jdbc url - - - impalaJdbcUrl - Impala server jdbc url - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hiveMetastoreUris} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - ${jobTracker} - ${nameNode} - - - mapred.job.queue.name - ${queueName} - - - testpython.py - python/testpython.py - - - - - - - Python action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml 
b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml index b39c3ff9b..c623a12f0 100755 --- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml @@ -19,7 +19,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.7-SNAPSHOT + 1.2.4-SNAPSHOT ../ 4.0.0 @@ -96,7 +96,7 @@ eu.dnetlib.dhp dhp-common - 1.1.7-SNAPSHOT + 1.2.4-SNAPSHOT jar diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java index cab0bc83f..de9e44fbf 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java @@ -9,6 +9,10 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; import org.apache.log4j.Logger; @@ -28,6 +32,7 @@ public abstract class ConnectDB { private static String dbHiveUrl; private static String dbImpalaUrl; private static String datasetUsageStatsDBSchema; + private static String datasetsUsageStatsPermanentDBSchema; private static String statsDBSchema; private final static Logger logger = Logger.getLogger(ConnectDB.class); private Statement stmt = null; @@ -37,6 +42,7 @@ public abstract class ConnectDB { dbHiveUrl = ExecuteWorkflow.dbHiveUrl; dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema; + datasetsUsageStatsPermanentDBSchema = ExecuteWorkflow.datasetsUsageStatsPermanentDBSchema; statsDBSchema = ExecuteWorkflow.statsDBSchema; Class.forName("org.apache.hive.jdbc.HiveDriver"); @@ -63,14 +69,25 @@ public abstract class ConnectDB { } public static String getDataSetUsageStatsDBSchema() { - return ConnectDB.datasetUsageStatsDBSchema; + String datePattern = "YYYYMMdd"; + DateFormat df = new SimpleDateFormat(datePattern); +// Get the today date using Calendar object. 
+ Date today = Calendar.getInstance().getTime(); + String todayAsString = df.format(today); + + return ConnectDB.datasetUsageStatsDBSchema + "_" + todayAsString; } public static String getStatsDBSchema() { return ConnectDB.statsDBSchema; } + public static String getDatasetsUsagestatsPermanentDBSchema() { + return ConnectDB.datasetsUsageStatsPermanentDBSchema; + } + private static Connection connectHive() throws SQLException { + logger.info("trying to open Hive connection..."); ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbHiveUrl); @@ -90,14 +107,18 @@ public abstract class ConnectDB { cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened database successfully"); + logger.info("Opened HIVE successfully"); return cpds.getConnection(); +// Connection connection = DriverManager.getConnection(dbHiveUrl); +// logger.debug("Opened Hive successfully"); +// +// return connection; } private static Connection connectImpala() throws SQLException { - + logger.info("trying to open Impala connection..."); ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbImpalaUrl); cpds.setUser("dimitris.pierrakos"); @@ -116,8 +137,12 @@ public abstract class ConnectDB { cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened database successfully"); + logger.info("Opened Impala successfully"); return cpds.getConnection(); +// Connection connection = DriverManager.getConnection(dbHiveUrl); +// logger.debug("Opened Impala successfully"); +// +// return connection; } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java index 17661b99e..baffa39e0 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java @@ -1,8 +1,6 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; -import java.sql.Connection; -import java.sql.SQLException; import java.sql.Statement; import org.slf4j.Logger; @@ -47,7 +45,7 @@ public class DatasetsStatsDB { try { stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + logger.info("Creating datacite usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); stmt.executeUpdate(createDatabase); @@ -55,6 +53,23 @@ public class DatasetsStatsDB { logger.error("Failed to create database: " + e); throw new Exception("Failed to create database: " + e.toString(), e); } + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger + .info( + "Creating permanent datasets usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema()); + String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS " + + ConnectDB.getDatasetsUsagestatsPermanentDBSchema(); + stmt.executeUpdate(createPermanentDatabase); + logger + .info( + "Created permanent datasets usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema()); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to 
create database: " + e.toString(), e); + } } private void createTables() throws Exception { @@ -62,10 +77,10 @@ public class DatasetsStatsDB { stmt = ConnectDB.getHiveConnection().createStatement(); // Create Reports table - This table should exist - logger.info("Creating Reports Table"); + logger.info("Creating Reports Tmp Table"); String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports(reportid STRING, \n" + + ".datacitereports_tmp(reportid STRING, \n" + " name STRING, \n" + " source STRING,\n" + " release STRING,\n" @@ -79,10 +94,10 @@ public class DatasetsStatsDB { logger.info("Reports Table Created"); // Create Datasets Performance Table - logger.info("Creating DataSetsPerformance Table"); + logger.info("Creating DataSetsPerformance Tmp Table"); String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance(ds_type STRING,\n" + + ".datasetsperformance_tmp(ds_type STRING,\n" + " ds_title STRING,\n" + " yop STRING,\n" + " dataset_type STRING, \n" @@ -100,7 +115,22 @@ public class DatasetsStatsDB { + " CLUSTERED BY (ds_type)\n" + " into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTableDataSetsPerformance); - logger.info("DataSetsPerformance Table Created"); + logger.info("DataSetsPerformance Tmp Table Created"); + + logger.info("Creating Datacite Reports table"); + String createDataciteReportsTable = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports LIKE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports_tmp STORED AS PARQUET"; + stmt.executeUpdate(createDataciteReportsTable); + logger.info("Datacite Reports Table created"); + + logger.info("Creating Datasets Performance table"); + String createDatasetPerformanceTable = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance LIKE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance_tmp STORED AS PARQUET"; + stmt.executeUpdate(createDatasetPerformanceTable); + logger.info("DatasetsPerformance Table created"); stmt.close(); ConnectDB.getHiveConnection().close(); diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java index b28578e4b..ffa8b8199 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java @@ -21,6 +21,7 @@ public class ExecuteWorkflow { static String dbHiveUrl; static String dbImpalaUrl; static String datasetUsageStatsDBSchema; + static String datasetsUsageStatsPermanentDBSchema; static String statsDBSchema; static boolean recreateDbAndTables; static boolean datasetsEmptyDirs; @@ -45,6 +46,7 @@ public class ExecuteWorkflow { dbHiveUrl = parser.get("dbHiveUrl"); dbImpalaUrl = parser.get("dbImpalaUrl"); datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema"); + datasetsUsageStatsPermanentDBSchema = parser.get("datasetsUsageStatsPermanentDBSchema"); statsDBSchema = parser.get("statsDBSchema"); if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) @@ -57,11 
+59,11 @@ public class ExecuteWorkflow { else datasetsEmptyDirs = false; -// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) -// finalTablesVisibleToImpala = true; -// else -// finalTablesVisibleToImpala = false; -// + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) + finalTablesVisibleToImpala = true; + else + finalTablesVisibleToImpala = false; + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java index 1b769bf53..e89e2e5a4 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -65,7 +65,7 @@ public class ReadReportsListFromDatacite { logger.info("Checking report with id " + reportID); String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports where reportid=?"; + + ".datacitereports_tmp where reportid=?"; PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); stGetReportID.setString(1, reportID); @@ -76,7 +76,7 @@ public class ReadReportsListFromDatacite { dropTmpReportsTable(); } else { String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datacitereports " + + " .datacitereports_tmp " + "SELECT\n" + " get_json_object(json, '$.report.id') AS reportid,\n" + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" @@ -127,7 +127,7 @@ public class ReadReportsListFromDatacite { public void readDatasetsReport(String prettyDatasetsReports, String reportId) throws Exception { logger.info("Reading Datasets performance for report " + reportId); logger.info("Write Performance Report To File"); - + ConnectDB.getHiveConnection().setAutoCommit(false); ObjectMapper objectMapper = new ObjectMapper(); JsonNode jsonNode = objectMapper.readValue(prettyDatasetsReports, JsonNode.class); String datasetsReports = jsonNode.toString(); @@ -177,7 +177,7 @@ public class ReadReportsListFromDatacite { stmt.execute(sqlCreateTempTableForDatasets); String sqlInsertToDatasetsPerformance = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance SELECT dataset.dataset_id[0].value ds_type, " + + ".datasetsperformance_tmp SELECT dataset.dataset_id[0].value ds_type, " + " dataset.dataset_title ds_title, " + " dataset.yop yop, " + " dataset.data_type dataset_type, " @@ -296,32 +296,93 @@ public class ReadReportsListFromDatacite { } public void createUsageStatisticsTable() throws SQLException { - logger.info("Dropping Downloads Stats table"); Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String dropDownloadsTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacite_downloads"; - stmt.executeUpdate(dropDownloadsTable); + + logger.info("Updating Datacite Reports table"); + String createDataciteReportsTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports " + + "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + 
".datacitereports_tmp"; + stmt.executeUpdate(createDataciteReportsTable); + logger.info("Datacite Reports Table updated"); + + logger.info("Updating Datasets Performance table"); + String createDatasetPerformanceTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance " + + "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance_tmp"; + stmt.executeUpdate(createDatasetPerformanceTable); + logger.info("DatasetsPerformance Table updated"); logger.info("Creating Downloads Stats table"); String createDownloadsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacite_downloads as " + + ".datacite_downloads STORED AS PARQUET as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " - + "where metric_type='total-dataset-requests'"; + + "where metric_type='total-dataset-requests' "; stmt.executeUpdate(createDownloadsTable); logger.info("Downloads Stats table created"); logger.info("Creating Views Stats table"); - String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views as " + String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacite_views STORED AS PARQUET as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " - + "where metric_type='total-dataset-investigations'"; + + "where metric_type='total-dataset-investigations' "; stmt.executeUpdate(createViewsTable); logger.info("Views Stats table created"); + + logger.info("Building Permanent Datasets Usage Stats DB"); + + logger.info("Dropping view datacitereports on permanent datacite usagestats DB"); + String sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacitereports on permanent datacite usagestats DB"); + + logger.info("Create view datacitereports on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + logger.info("Created view datacitereports on permanent datasets usagestats DB"); + + logger.info("Dropping view datasetsperformance on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + logger.info("Dropped view datasetsperformance on permanent datacite usagestats DB"); + + logger.info("Create view datasetsperformance on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + 
".datasetsperformance"; + stmt.executeUpdate(sql); + logger.info("Created view datasetsperformance on permanent datasets usagestats DB"); + + logger.info("Dropping view datacite_views on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacite_views on permanent datacite usagestats DB"); + + logger.info("Create view datacite_views on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Created view datacite_views on permanent datasets usagestats DB"); + + logger.info("Dropping view datacite_downloads on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacite_downloads on permanent datacite usagestats DB"); + + logger.info("Create view datacite_downloads on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Created view datacite_downloads on permanent datasets usagestats DB"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Completed Building Permanent Datasets Usage Stats DB"); } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java old mode 100644 new mode 100755 index d96d7e875..8d6e24333 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java @@ -2,6 +2,7 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.io.IOException; +import java.sql.SQLException; import java.sql.Statement; import org.apache.hadoop.conf.Configuration; @@ -67,5 +68,50 @@ public class UsageStatsExporter { readReportsListFromDatacite.readReports(); logger.info("Reports Stored To DB"); readReportsListFromDatacite.createUsageStatisticsTable(); + + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } + + logger.info("End"); + } + + private void invalidateMetadata() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getImpalaConnection().createStatement(); + + String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + + sql = 
"INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + + stmt.close(); + try { + ConnectDB.getHiveConnection().close(); + } catch (Exception e) { + logger.info("Message at the end :" + e.getMessage()); + } } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json index f8d51a882..f67651627 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json @@ -1,56 +1,62 @@ [ - { - "paramName": "dbu", - "paramLongName": "dataciteBaseURL", - "paramDescription": "URL of Datacite Reports Endpoint", - "paramRequired": true - }, - { - "paramName": "drp", - "paramLongName": "dataciteReportPath", - "paramDescription": "Path for Datacite Reports", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dusdbs", - "paramLongName": "datasetUsageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "datasetsEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the dataset_usage_stats, visible to Impala", - "paramRequired": true - } + { + "paramName": "dbu", + "paramLongName": "dataciteBaseURL", + "paramDescription": "URL of Datacite Reports Endpoint", + "paramRequired": true + }, + { + "paramName": "drp", + "paramLongName": "dataciteReportPath", + "paramDescription": "Path for Datacite Reports", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. 
Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dusdbs", + "paramLongName": "datasetUsageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "uspdbs", + "paramLongName": "datasetsUsageStatsPermanentDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "datasetsEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the dataset_usage_stats, visible to Impala", + "paramRequired": true + } ] diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml index 36c1ccea5..22bf22c01 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + hiveMetastoreUris @@ -52,6 +52,8 @@ ${impalaJdbcUrl} --datasetUsageStatsDBSchema ${datasetUsageStatsDBSchema} + --datasetsUsageStatsPermanentDBSchema + ${datasetsUsageStatsPermanentDBSchema} --statsDBSchema ${statsDBSchema} --recreateDbAndTables diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java index e0e0d3687..d2884a4bb 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java @@ -65,6 +65,8 @@ public class ExecuteWorkflow { static int numberOfDownloadThreads; + static int b2SSHAREID; + public static void main(String args[]) throws Exception { // Sending the logs to the console @@ -196,6 +198,8 @@ public class ExecuteWorkflow { numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); + b2SSHAREID = Integer.parseInt(parser.get("b2shareID")); + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); // usagestatsExport.createdDBWithTablesOnly(); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java index a84d6743f..76412cd54 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java +++ 
b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java @@ -191,7 +191,7 @@ public class PiwikDownloadLogs { ResultSet rs = statement .executeQuery( "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() - + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + + ".datasource where piwik_id is not null and piwik_id <> 0 and piwik_id <> 196 order by piwik_id"); // Getting all the piwikids in a list for logging reasons & limitting the list // to the max number of piwikids diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java index 9144620b7..00378ca1f 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java @@ -179,6 +179,10 @@ public class PiwikStatsDB { createPedocsOldUsageData(); logger.info("Pedocs Tables Created"); + logger.info("Create Datacite Tables"); + createDatasetsUsageData(); + logger.info("Datacite Tables Created"); + } catch (Exception e) { logger.error("Failed to process logs: " + e); throw new Exception("Failed to process logs: " + e.toString(), e); @@ -281,6 +285,7 @@ public class PiwikStatsDB { // clean view double clicks logger.info("Cleaning action double clicks"); + ConnectDB.getHiveConnection().setAutoCommit(false); sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "WHERE EXISTS (\n" + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" @@ -750,6 +755,16 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("Dropped sarc_sushilogtmp_json_non_array"); + logger.info("Dropping piwiklogb2sharetmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogb2sharetmp"); + + logger.info("Dropping piwiklog_b2share_tmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog_b2share_tmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklog_b2share_tmp_json"); + stmt.close(); ConnectDB.getHiveConnection().close(); @@ -832,4 +847,32 @@ public class PiwikStatsDB { logger.info("PeDocs Old Downloads Table created"); } + + public void createDatasetsUsageData() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping datacite_views"); + String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Dropped datacite_views"); + + logger.info("Dropping datacite_downloads"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Dropped datacite_downloads"); + + logger.info("Creating Datasets Views Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".datacite_views as select * from openaire_prod_datacite_usage_stats.datacite_views"; + stmt.executeUpdate(sql); + logger.info("Datasets Views Table created"); + + logger.info("Creating Datasets Downloads Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".datacite_downloads as 
select * from openaire_prod_datacite_usage_stats.datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Datasets Downloads Table created"); + + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java index 07e15605f..2f10e4d2b 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java @@ -142,8 +142,20 @@ public class UsageStatsExporter { sarcStats.updateSarcLogs(); } logger.info("Sarc done"); - // finalize usagestats + PiwikDownloadLogs_B2SHARE b2sharePiwikID = new PiwikDownloadLogs_B2SHARE(ExecuteWorkflow.matomoBaseURL, + ExecuteWorkflow.matomoAuthToken); + b2sharePiwikID.GetOpenAIREB2SHARELogs(ExecuteWorkflow.repoLogPath); + logger.info("B2SHARE done"); + + PiwikStatsDB_B2SHARE piwikstatsB2SHAREdb = new PiwikStatsDB_B2SHARE(ExecuteWorkflow.repoLogPath, + ExecuteWorkflow.portalLogPath); + piwikstatsB2SHAREdb.setCounterRobotsURL(cRobotsUrl); + + logger.info("Processing B2SHARE logs"); + piwikstatsB2SHAREdb.processB2SHARELogs(); + + // finalize usagestats logger.info("Dropping tmp tables"); if (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); @@ -161,6 +173,7 @@ public class UsageStatsExporter { piwikstatsdb.recreateDBAndTables(); piwikstatsdb.createPedocsOldUsageData(); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); logger.info("Creating LaReferencia tables"); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json index 1aa5ad6f8..8c733c55b 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json @@ -215,5 +215,11 @@ "paramLongName": "numberOfDownloadThreads", "paramDescription": "Number of download threads", "paramRequired": true + }, + { + "paramName": "b2shareID", + "paramLongName": "b2shareID", + "paramDescription": "B2SHARE Matomo ID", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml index 022a107ab..80e1da478 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + hiveMetastoreUris @@ -78,6 +78,7 @@ --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} --finalizeStats${finalizeStats} --numberOfDownloadThreads${numberOfDownloadThreads} + --b2shareID${b2shareID} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java 
b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java index e53709f1a..ea07ed732 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java @@ -82,7 +82,7 @@ public abstract class ConnectDB { Date today = Calendar.getInstance().getTime(); String todayAsString = df.format(today); - return ConnectDB.usageStatsDBSchema + "_" + todayAsString; + return ConnectDB.usageStatsDBSchema + todayAsString; } public static String getStatsDBSchema() { diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java index 5a6953f4c..7c6f28023 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java @@ -35,20 +35,20 @@ public class PiwikStatsDB { private void createDatabase() throws Exception { -// try { -// -// stmt = ConnectDB.getHiveConnection().createStatement(); -// -// logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); -// String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; -// stmt.executeUpdate(dropDatabase); -// } catch (Exception e) { -// logger.error("Failed to drop database: " + e); -// throw new Exception("Failed to drop database: " + e.toString(), e); -// } -// try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); stmt.executeUpdate(createDatabase); @@ -337,6 +337,96 @@ public class PiwikStatsDB { } + public void uploadB2SHAREStats() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping B2SHARE b2share_result_views_monthly_tmp view + logger.info("Dropping B2SHARE b2share_result_views_monthly_tmp view"); + String sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_views_monthly_tmp"; + logger.info("Dropped b2share_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping B2SHARE b2share_result_views_monthly_tmp view + logger.info("Dropping b2SHARE b2share_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_downloads_monthly_tmp"; + logger.info("Dropped b2share_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping B2SHARE b2share_views_stats_tmp table + logger.info("Dropping B2SHARE b2share_views_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_views_stats_tmp"; + logger.info("Dropped b2share_views_stats_tmp 
table "); + stmt.executeUpdate(sql); + + // Dropping B2SHARE b2share_downloads_stats_tmp table + logger.info("Dropping B2SHARE b2share_downloads_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_downloads_stats_tmp"; + logger.info("Dropped b2share_downloads_stats_tmp table "); + stmt.executeUpdate(sql); + + // Creating B2SHARE b2share_result_views_monthly_tmp view + logger.info("Creating B2SHARE b2share_result_views_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_views_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='action' and (source_item_type='oaItem' or source_item_type='repItem') and source=412 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created b2share_result_views_monthly_tmp view "); + + // Creating B2SHARE b2share_views_stats_tmp table + logger.info("Creating B2SHARE b2share_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_views_stats_tmp AS " + + "SELECT 'B2SHARE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".b2share_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.id=ro.oid and d.id='re3data_____::ad3609c351bd520edf6f10f5e0d9b877' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created B2SHARE b2share_views_stats_tmp table"); + + // Creating B2SHARE b2share_result_downloads_monthly_tmp view + logger.info("Creating B2SHARE b2share_result_downloads_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_downloads_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='download' and (source_item_type='oaItem' or source_item_type='repItem') and source=412 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created b2share_result_downloads_monthly_tmp view "); + + // Creating B2SHARE b2share_downloads_stats_tmp table + logger.info("Creating B2SHARE b2share_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_downloads_stats_tmp AS " + + "SELECT 'B2SHARE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".b2share_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, 
" + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.id=ro.oid and d.id='re3data_____::ad3609c351bd520edf6f10f5e0d9b877' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created B2SHARE b2share_downloads_stats_tmp table"); + + // Dropping B2SHARE b2share_result_views_monthly_tmp view + logger.info("Dropping B2SHARE b2share_result_views_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_views_monthly_tmp"; + logger.info("Dropped b2share_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping B2SHARE b2share_result_views_monthly_tmp view + logger.info("Dropping B2SHARE b2share_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".b2share_result_downloads_monthly_tmp"; + logger.info("Dropped b2share_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + } + public void finalizeStats() throws Exception { stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); @@ -402,6 +492,13 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("LaReferencia views updated to views_stats"); + // Inserting B2SHARE views stats + logger.info("Inserting B2SHARE data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".b2share_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("B2SHARE views updated to views_stats"); + logger.info("Creating downloads_stats table"); String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() @@ -425,12 +522,18 @@ public class PiwikStatsDB { logger.info("Inserted Pedocs data to downloads_stats"); // Inserting TUDELFT downloads stats - logger.info("Inserting TUDELFT old data to downloads_stats"); + logger.info("Inserting TUDELFT data to downloads_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp"; stmt.executeUpdate(sql); logger.info("Inserted TUDELFT data to downloads_stats"); + // Inserting B2SHARE downloads stats + logger.info("Inserting B2SHARE data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".b2share_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Inserted B2SHARE data to downloads_stats"); // Inserting Lareferencia downloads stats logger.info("Inserting LaReferencia data to downloads_stats"); sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " @@ -452,6 +555,20 @@ public class PiwikStatsDB { stmt.executeUpdate(sql); logger.info("SARC-OJS downloads updated to downloads_stats"); + // Inserting Datacite views stats + logger.info("Inserting Datacite views to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Datacite views updated to views_stats"); + + // Inserting Datacite downloads stats + logger.info("Inserting Datacite downloads to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageRawDataDBSchema() + ".datacite_downloads"; + 
stmt.executeUpdate(sql); + logger.info("Datacite downloads updated to downloads_stats"); + logger.info("Creating pageviews_stats table"); String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java index 47986f52a..0df6c8b2d 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java @@ -51,6 +51,9 @@ public class UsageStatsExporter { logger.info("Processing TUDELFT Stats"); piwikstatsdb.uploadTUDELFTStats(); logger.info("Processing TUDELFT Stats Done"); + logger.info("Processing B2SHARE Stats"); + piwikstatsdb.uploadB2SHAREStats(); + logger.info("Processing B2SHARE Stats Done"); } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml index 71e8a50d6..45a6abf3d 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + hiveMetastoreUris -- 2.17.1 From 4125b716613ae6da290755297f38060ba0f50aea Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 4 Jun 2021 15:49:51 +0300 Subject: [PATCH 04/12] Add B2Share --- .../export/PiwikDownloadLogs_B2SHARE.java | 204 ++++++++++++ .../export/PiwikStatsDB_B2SHARE.java | 304 ++++++++++++++++++ 2 files changed, 508 insertions(+) create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs_B2SHARE.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB_B2SHARE.java diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs_B2SHARE.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs_B2SHARE.java new file mode 100644 index 000000000..9ec6fb72e --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs_B2SHARE.java @@ -0,0 +1,204 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. 
Pierrakos + */
+public class PiwikDownloadLogs_B2SHARE {
+
+    private final String piwikUrl;
+    private Date startDate;
+    private final String tokenAuth;
+
+    /*
+     * The Piwik's API method
+     */
+    private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+    private final String format = "&format=json";
+
+    private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs_B2SHARE.class);
+
+    public PiwikDownloadLogs_B2SHARE(String piwikUrl, String tokenAuth) {
+        this.piwikUrl = piwikUrl;
+        this.tokenAuth = tokenAuth;
+
+    }
+
+    private String getPiwikLogUrl() {
+        return "https://" + piwikUrl + "/";
+    }
+
+    private String getJson(String url) throws Exception {
+        try {
+            logger.debug("Connecting to download the JSON: " + url);
+            URL website = new URL(url);
+            URLConnection connection = website.openConnection();
+
+            StringBuilder response;
+            try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+                response = new StringBuilder();
+                String inputLine;
+                while ((inputLine = in.readLine()) != null) {
+                    response.append(inputLine);
+                }
+            }
+            return response.toString();
+        } catch (Exception e) {
+            logger.error("Failed to get URL: " + url + " Exception: " + e);
+            throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
+        }
+    }
+
+    public void GetOpenAIREB2SHARELogs(String repoLogsPath) throws Exception {
+
+        Statement statement = ConnectDB.getHiveConnection().createStatement();
+        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+        List<Integer> piwikIdToVisit = new ArrayList<Integer>();
+        piwikIdToVisit.add(ExecuteWorkflow.b2SSHAREID);
+        logger.info("B2SHARE piwikId for download: " + piwikIdToVisit);
+
+        if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+            && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
+            logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+            piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+        }
+
+        logger.info("Downloading for the following piwikIds: " + piwikIdToVisit);
+
+        // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
+        for (int siteId : piwikIdToVisit) {
+            // Setting the starting period
+            Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+            logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+
+            // Setting the ending period (last day of the month)
+            // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+            Calendar end = Calendar.getInstance();
+            end.add(Calendar.DAY_OF_MONTH, -1);
+            // end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+            logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+
+            logger.info("Now working on piwikId: " + siteId);
+
+            PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
+                .prepareStatement(
+                    "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+                        + ".piwiklog WHERE source=?");
+            st.setInt(1, siteId);
+            Date dateMax = null;
+            ResultSet rs_date = st.executeQuery();
+            while (rs_date.next()) {
+                logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
+
+                if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+                    && !rs_date.getString(1).equals("")) {
+                    start.setTime(sdf.parse(rs_date.getString(1)));
+                    dateMax = sdf.parse(rs_date.getString(1));
+                }
+            }
+            rs_date.close();
+
+            for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+                // logger.info("Date used " + currDay.toString());
+                // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+                // executor.execute(worker);// calling execute method of ExecutorService
+                logger.info("Date used " + currDay.getTime().toString());
+
+                if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+                    logger.info("Date found in logs " + dateMax + " and not downloading Matomo logs for " + siteId);
+                } else {
+                    GetOpenAIRELogsB2SHAREForDate(currDay, siteId, repoLogsPath);
+                }
+
+            }
+        }
+        // executor.shutdown();
+        // while (!executor.isTerminated()) {
+        // }
+        // System.out.println("Finished all threads");
+    }
+
+    public void GetOpenAIRELogsB2SHAREForDate(Calendar currDay, int siteId, String repoLogsPath) throws Exception {
+        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+        Date date = currDay.getTime();
+        logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
+
+        String period = "&period=day&date=" + sdf.format(date);
+        String outFolder = repoLogsPath;
+
+        String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+            + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+        String content = "";
+
+        int i = 0;
+
+        JSONParser parser = new JSONParser();
+        StringBuffer totalContent = new StringBuffer();
+        FileSystem fs = FileSystem.get(new Configuration());
+
+        do {
+            int writtenBytes = 0;
+            String apiUrl = baseApiUrl;
+
+            if (i > 0) {
+                apiUrl += "&filter_offset=" + (i * 1000);
+            }
+
+            content = getJson(apiUrl);
+            if (content.length() == 0 || content.equals("[]")) {
+                break;
+            }
+
+            FSDataOutputStream fin = fs
+                .create(
+                    new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+                        + ".json"),
+                    true);
+            JSONArray jsonArray = (JSONArray) parser.parse(content);
+            for (Object aJsonArray : jsonArray) {
+                JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+                byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+                fin.write(jsonObjectRawBytes);
+                fin.writeChar('\n');
+
+                writtenBytes += jsonObjectRawBytes.length + 1;
+            }
+
+            fin.close();
+            System.out
+                .println(
+                    Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+                        + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+                        + ".json");
+
+            i++;
+        } while (true);
+
+        fs.close();
+    }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB_B2SHARE.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB_B2SHARE.java
new file mode 100644
index 000000000..886079a23
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB_B2SHARE.java
@@ -0,0 +1,304 @@
+
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.io.*;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S.
Zoupanos + */ +public class PiwikStatsDB_B2SHARE { + + private String logPath; + private String logRepoPath; + private String logPortalPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB_B2SHARE.class); + + private String CounterRobotsURL; + private ArrayList robotsList; + + public PiwikStatsDB_B2SHARE(String logRepoPath, String logPortalPath) throws Exception { + this.logRepoPath = logRepoPath; + this.logPortalPath = logPortalPath; + + } + + public ArrayList getRobotsList() { + return robotsList; + } + + public void setRobotsList(ArrayList robotsList) { + this.robotsList = robotsList; + } + + public String getCounterRobotsURL() { + return CounterRobotsURL; + } + + public void setCounterRobotsURL(String CounterRobotsURL) { + this.CounterRobotsURL = CounterRobotsURL; + } + + public void processB2SHARELogs() throws Exception { + try { + + logger.info("Processing B2SHARE logs"); + processLog(); + logger.info("B2SHARE logs process done"); + + logger.info("Removing double clicks from B2SHARE logs"); + removeDoubleClicks(); + logger.info("Removing double clicks from B2SHARE logs done"); + + logger.info("Updating Production Tables"); + updateProdTables(); + logger.info("Updated Production Tables"); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + public void processLog() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping piwiklog_b2share_tmp_json table"); + String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog_b2share_tmp_json"; + stmt.executeUpdate(drop_piwiklogtmp_json); + logger.info("Dropped piwiklog_b2share_tmp_json table"); + + logger.info("Creating piwiklog_b2share_tmp_json"); + String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog_b2share_tmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " eventAction: STRING,\n" + + " eventName: STRING,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_piwiklogtmp_json); + logger.info("Created piwiklog_b2share_tmp_json"); + + logger.info("Dropping piwiklogtmp table"); + String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp"; + stmt.executeUpdate(drop_piwiklogtmp); + logger.info("Dropped piwiklogtmp"); + + logger.info("Creating piwiklogb2sharetmp"); + String create_piwiklogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogb2sharetmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source) into 100 buckets stored as orc 
tblproperties('transactional'='true')"; + stmt.executeUpdate(create_piwiklogtmp); + logger.info("Created piwiklogb2sharetmp"); + + logger.info("Inserting into piwiklogb2sharetmp"); + String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + + "actiondetail.eventAction as action, actiondetail.url as url, " + + "actiondetail.eventName as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent\n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog_b2share_tmp_json\n" + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_piwiklogtmp); + logger.info("Inserted into piwiklogb2sharetmp"); + + stmt.close(); + } + + public void removeDoubleClicks() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning download double clicks"); + // clean download double clicks + String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp " + + "WHERE EXISTS (\n" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp p1, " + + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp p2\n" + + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" + + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + + "AND p1.timestamp listHdfsDir(String dir) throws Exception { + + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + fileNames.add(fileName); + } + + hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath)); + throw new Exception("HDFS file path with exported data does not exist : " + logPath, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + if (!line.equals("[]")) { + sb.append(line); + } + // sb.append(line); + line = br.readLine(); + } + result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + if (result.equals("")) { + result = "[]"; + } + + // fs.close(); + } catch (Exception e) { + logger.error(e.getMessage()); + throw new Exception(e); + } + + return result; + } + + private Connection getConnection() throws SQLException { + return ConnectDB.getHiveConnection(); + } + + public void createPedocsOldUsageData() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating PeDocs Old Views Table"); + String sql = "Create TABLE IF NOT EXISTS " + 
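
The statements above are the core of the log processing step: an external table backed by the hive-hcatalog JsonSerDe exposes the raw JSON dumps directly from HDFS, and LATERAL VIEW explode() flattens the nested actionDetails array into one row per action before the result lands in a bucketed ORC table. A reduced sketch of that flow follows; the connection URL, schema, table names and column set are illustrative and are not the ones created by this patch.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class ExplodeActionDetailsSketch {

	public static void main(String[] args) throws Exception {
		Class.forName("org.apache.hive.jdbc.HiveDriver");
		try (Connection conn = DriverManager.getConnection("jdbc:hive2://hive.example.org:10000/default");
			Statement stmt = conn.createStatement()) {

			stmt.executeUpdate("CREATE DATABASE IF NOT EXISTS usagestats");

			// Raw Matomo dump: one JSON document per line, read in place from HDFS by the JsonSerDe.
			stmt.executeUpdate(
				"CREATE EXTERNAL TABLE IF NOT EXISTS usagestats.piwiklog_json ("
					+ " idSite STRING, idVisit STRING, country STRING,"
					+ " actionDetails ARRAY<STRUCT<url:STRING, eventAction:STRING, eventName:STRING>>)"
					+ " ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'"
					+ " LOCATION '/tmp/piwiklogs'");

			// Flat target table: one row per action.
			stmt.executeUpdate(
				"CREATE TABLE IF NOT EXISTS usagestats.piwiklog_flat"
					+ " (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING)"
					+ " STORED AS ORC");

			// LATERAL VIEW explode() emits one output row per element of the actionDetails array.
			stmt.executeUpdate(
				"INSERT INTO usagestats.piwiklog_flat"
					+ " SELECT cast(idSite AS BIGINT), idVisit, country,"
					+ " a.eventAction, a.url, a.eventName"
					+ " FROM usagestats.piwiklog_json"
					+ " LATERAL VIEW explode(actionDetails) t AS a");
		}
	}
}
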
ConnectDB.getUsageStatsDBSchema() + + ".pedocsoldviews as select * from default.pedocsviews"; + stmt.executeUpdate(sql); + logger.info("PeDocs Old Views Table created"); + + logger.info("Creating PeDocs Old Downloads Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pedocsolddownloads as select * from default.pedocsdownloads"; + stmt.executeUpdate(sql); + logger.info("PeDocs Old Downloads Table created"); + + } + + public void createDatasetsUsageData() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating Datasets Views Table"); + String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".datacite_views as select * from datasetsusagestats_20210301.datacite_views"; + stmt.executeUpdate(sql); + logger.info("Datasets Views Table created"); + + logger.info("Creating Datasets Downloads Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".datacite_downloads as select * from datasetsusagestats_20210301.datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Datasets Downloads Table created"); + + } +} -- 2.17.1 From 1e06815cdb8e3cdc091cf1da42964f2fe2e25f5d Mon Sep 17 00:00:00 2001 From: Dimitris Date: Wed, 17 Feb 2021 09:46:38 +0200 Subject: [PATCH 05/12] Added Datasets from Datacite WF --- dhp-workflows/dhp-indicators/pom.xml | 107 ++++++++++++++++ dhp-workflows/dhp-indicators/runworkflow.sh | 1 + .../indicators/export/ExecuteWorkflow.java | 35 ++++++ .../indicators/oozie_app/config-default.xml | 38 ++++++ .../indicators/oozie_app/python/testpython.py | 5 + .../indicators/oozie_app/python/testscript.sh | 2 + .../graph/indicators/oozie_app/workflow.xml | 58 +++++++++ .../dhp-usage-datasets-stats-update/pom.xml | 4 +- .../datasetsusagestats/export/ConnectDB.java | 33 +---- .../export/DatasetsStatsDB.java | 46 ++----- .../export/ExecuteWorkflow.java | 12 +- .../export/ReadReportsListFromDatacite.java | 93 +++----------- .../export/UsageStatsExporter.java | 46 ------- .../datasets_usagestats_parameters.json | 114 +++++++++--------- .../datasetsusagestats/oozie_app/workflow.xml | 4 +- 15 files changed, 335 insertions(+), 263 deletions(-) create mode 100644 dhp-workflows/dhp-indicators/pom.xml create mode 100755 dhp-workflows/dhp-indicators/runworkflow.sh create mode 100644 dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml mode change 100755 => 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java diff --git a/dhp-workflows/dhp-indicators/pom.xml b/dhp-workflows/dhp-indicators/pom.xml new file mode 100644 index 000000000..937795791 --- /dev/null +++ b/dhp-workflows/dhp-indicators/pom.xml @@ -0,0 +1,107 @@ + + + + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + + 4.0.0 + dhp-indicators + + + + pl.project13.maven + 
git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + dhp-indicators + diff --git a/dhp-workflows/dhp-indicators/runworkflow.sh b/dhp-workflows/dhp-indicators/runworkflow.sh new file mode 100755 index 000000000..0cad5792d --- /dev/null +++ b/dhp-workflows/dhp-indicators/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java new file mode 100644 index 000000000..61e6ef72c --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java @@ -0,0 +1,35 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.indicators.export; + +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.BasicConfigurator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * @author D. 
Pierrakos + */ +public class ExecuteWorkflow { + + private static final Logger logger = LoggerFactory.getLogger(ExecuteWorkflow.class); + + public static void main(String args[]) throws Exception { + + // Sending the logs to the console + BasicConfigurator.configure(); + + logger.info("Workflow Executed"); + } + +} diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml new file mode 100644 index 000000000..b5c807378 --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py new file mode 100644 index 000000000..e913df6ae --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py @@ -0,0 +1,5 @@ +#! /usr/bin/env python +import sys + +print "this is a Python script" +print "Python Interpreter Version: " + sys.version \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh new file mode 100644 index 000000000..78938c85a --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh @@ -0,0 +1,2 @@ +#!/bin/bash +echo "`date` hi" \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml new file mode 100644 index 000000000..2b8ed7d99 --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml @@ -0,0 +1,58 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + ${jobTracker} + ${nameNode} + + + mapred.job.queue.name + ${queueName} + + + testpython.py + python/testpython.py + + + + + + + Python action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml index 
c623a12f0..b39c3ff9b 100755 --- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml @@ -19,7 +19,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.1.7-SNAPSHOT ../ 4.0.0 @@ -96,7 +96,7 @@ eu.dnetlib.dhp dhp-common - 1.2.4-SNAPSHOT + 1.1.7-SNAPSHOT jar diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java index de9e44fbf..cab0bc83f 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java @@ -9,10 +9,6 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Calendar; -import java.util.Date; import org.apache.log4j.Logger; @@ -32,7 +28,6 @@ public abstract class ConnectDB { private static String dbHiveUrl; private static String dbImpalaUrl; private static String datasetUsageStatsDBSchema; - private static String datasetsUsageStatsPermanentDBSchema; private static String statsDBSchema; private final static Logger logger = Logger.getLogger(ConnectDB.class); private Statement stmt = null; @@ -42,7 +37,6 @@ public abstract class ConnectDB { dbHiveUrl = ExecuteWorkflow.dbHiveUrl; dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema; - datasetsUsageStatsPermanentDBSchema = ExecuteWorkflow.datasetsUsageStatsPermanentDBSchema; statsDBSchema = ExecuteWorkflow.statsDBSchema; Class.forName("org.apache.hive.jdbc.HiveDriver"); @@ -69,25 +63,14 @@ public abstract class ConnectDB { } public static String getDataSetUsageStatsDBSchema() { - String datePattern = "YYYYMMdd"; - DateFormat df = new SimpleDateFormat(datePattern); -// Get the today date using Calendar object. 
- Date today = Calendar.getInstance().getTime(); - String todayAsString = df.format(today); - - return ConnectDB.datasetUsageStatsDBSchema + "_" + todayAsString; + return ConnectDB.datasetUsageStatsDBSchema; } public static String getStatsDBSchema() { return ConnectDB.statsDBSchema; } - public static String getDatasetsUsagestatsPermanentDBSchema() { - return ConnectDB.datasetsUsageStatsPermanentDBSchema; - } - private static Connection connectHive() throws SQLException { - logger.info("trying to open Hive connection..."); ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbHiveUrl); @@ -107,18 +90,14 @@ public abstract class ConnectDB { cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened HIVE successfully"); + logger.info("Opened database successfully"); return cpds.getConnection(); -// Connection connection = DriverManager.getConnection(dbHiveUrl); -// logger.debug("Opened Hive successfully"); -// -// return connection; } private static Connection connectImpala() throws SQLException { - logger.info("trying to open Impala connection..."); + ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbImpalaUrl); cpds.setUser("dimitris.pierrakos"); @@ -137,12 +116,8 @@ public abstract class ConnectDB { cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened Impala successfully"); + logger.info("Opened database successfully"); return cpds.getConnection(); -// Connection connection = DriverManager.getConnection(dbHiveUrl); -// logger.debug("Opened Impala successfully"); -// -// return connection; } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java index baffa39e0..17661b99e 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java @@ -1,6 +1,8 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; +import java.sql.Connection; +import java.sql.SQLException; import java.sql.Statement; import org.slf4j.Logger; @@ -45,7 +47,7 @@ public class DatasetsStatsDB { try { stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating datacite usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); stmt.executeUpdate(createDatabase); @@ -53,23 +55,6 @@ public class DatasetsStatsDB { logger.error("Failed to create database: " + e); throw new Exception("Failed to create database: " + e.toString(), e); } - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger - .info( - "Creating permanent datasets usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema()); - String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS " - + ConnectDB.getDatasetsUsagestatsPermanentDBSchema(); - stmt.executeUpdate(createPermanentDatabase); - logger - .info( - "Created permanent datasets usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema()); - - } catch (Exception e) { - logger.error("Failed to create 
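
Both versions of ConnectDB in this series wrap the Hive and Impala JDBC URLs in a c3p0 connection pool; the hunk above mainly changes the logging and schema helpers around it. For reference, a minimal sketch of that pooled setup looks like the following; the URL and pool sizing are illustrative values, not the production configuration.

import java.sql.Connection;

import com.mchange.v2.c3p0.ComboPooledDataSource;

public class PooledHiveConnectionSketch {

	public static Connection open(String hiveJdbcUrl) throws Exception {
		Class.forName("org.apache.hive.jdbc.HiveDriver");

		ComboPooledDataSource cpds = new ComboPooledDataSource();
		cpds.setJdbcUrl(hiveJdbcUrl);
		cpds.setAcquireIncrement(1); // grow the pool one connection at a time
		cpds.setMinPoolSize(1);
		cpds.setMaxPoolSize(5);
		cpds.setPreferredTestQuery("SELECT 1"); // cheap liveness probe
		cpds.setIdleConnectionTestPeriod(60); // re-test idle connections every minute

		return cpds.getConnection();
	}

	public static void main(String[] args) throws Exception {
		try (Connection conn = open("jdbc:hive2://hive.example.org:10000/;UseNativeQuery=1")) {
			System.out.println("connected: " + !conn.isClosed());
		}
	}
}
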
database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } } private void createTables() throws Exception { @@ -77,10 +62,10 @@ public class DatasetsStatsDB { stmt = ConnectDB.getHiveConnection().createStatement(); // Create Reports table - This table should exist - logger.info("Creating Reports Tmp Table"); + logger.info("Creating Reports Table"); String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports_tmp(reportid STRING, \n" + + ".datacitereports(reportid STRING, \n" + " name STRING, \n" + " source STRING,\n" + " release STRING,\n" @@ -94,10 +79,10 @@ public class DatasetsStatsDB { logger.info("Reports Table Created"); // Create Datasets Performance Table - logger.info("Creating DataSetsPerformance Tmp Table"); + logger.info("Creating DataSetsPerformance Table"); String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance_tmp(ds_type STRING,\n" + + ".datasetsperformance(ds_type STRING,\n" + " ds_title STRING,\n" + " yop STRING,\n" + " dataset_type STRING, \n" @@ -115,22 +100,7 @@ public class DatasetsStatsDB { + " CLUSTERED BY (ds_type)\n" + " into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTableDataSetsPerformance); - logger.info("DataSetsPerformance Tmp Table Created"); - - logger.info("Creating Datacite Reports table"); - String createDataciteReportsTable = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports LIKE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports_tmp STORED AS PARQUET"; - stmt.executeUpdate(createDataciteReportsTable); - logger.info("Datacite Reports Table created"); - - logger.info("Creating Datasets Performance table"); - String createDatasetPerformanceTable = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance LIKE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance_tmp STORED AS PARQUET"; - stmt.executeUpdate(createDatasetPerformanceTable); - logger.info("DatasetsPerformance Table created"); + logger.info("DataSetsPerformance Table Created"); stmt.close(); ConnectDB.getHiveConnection().close(); diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java index ffa8b8199..b28578e4b 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java @@ -21,7 +21,6 @@ public class ExecuteWorkflow { static String dbHiveUrl; static String dbImpalaUrl; static String datasetUsageStatsDBSchema; - static String datasetsUsageStatsPermanentDBSchema; static String statsDBSchema; static boolean recreateDbAndTables; static boolean datasetsEmptyDirs; @@ -46,7 +45,6 @@ public class ExecuteWorkflow { dbHiveUrl = parser.get("dbHiveUrl"); dbImpalaUrl = parser.get("dbImpalaUrl"); datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema"); - datasetsUsageStatsPermanentDBSchema = parser.get("datasetsUsageStatsPermanentDBSchema"); statsDBSchema = parser.get("statsDBSchema"); if 
(parser.get("recreateDbAndTables").toLowerCase().equals("true")) @@ -59,11 +57,11 @@ public class ExecuteWorkflow { else datasetsEmptyDirs = false; - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) - finalTablesVisibleToImpala = true; - else - finalTablesVisibleToImpala = false; - +// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) +// finalTablesVisibleToImpala = true; +// else +// finalTablesVisibleToImpala = false; +// UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java index e89e2e5a4..6e8c0e397 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -65,7 +65,7 @@ public class ReadReportsListFromDatacite { logger.info("Checking report with id " + reportID); String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports_tmp where reportid=?"; + + ".datacitereports where reportid=?"; PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); stGetReportID.setString(1, reportID); @@ -76,7 +76,7 @@ public class ReadReportsListFromDatacite { dropTmpReportsTable(); } else { String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datacitereports_tmp " + + " .datacitereports " + "SELECT\n" + " get_json_object(json, '$.report.id') AS reportid,\n" + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" @@ -105,7 +105,7 @@ public class ReadReportsListFromDatacite { ResultSet rstmpReportAll = stmt.getResultSet(); if (rstmpReportAll.next()) { String listDatasets = rstmpReportAll.getString(1); - logger.info("Adding uncompressed performance for " + reportID); + logger.info("No compressed performance found"); this.readDatasetsReport(listDatasets, reportID); } @@ -125,9 +125,6 @@ public class ReadReportsListFromDatacite { } public void readDatasetsReport(String prettyDatasetsReports, String reportId) throws Exception { - logger.info("Reading Datasets performance for report " + reportId); - logger.info("Write Performance Report To File"); - ConnectDB.getHiveConnection().setAutoCommit(false); ObjectMapper objectMapper = new ObjectMapper(); JsonNode jsonNode = objectMapper.readValue(prettyDatasetsReports, JsonNode.class); String datasetsReports = jsonNode.toString(); @@ -154,7 +151,8 @@ public class ReadReportsListFromDatacite { fin.writeChar('\n'); fin.close(); - logger.info("Reading Performance Report From File..."); + logger.info("Write Compress Report To File"); + logger.info("Reading Compress Report From File..."); String sqlCreateTempTableForDatasets = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed (report_datasets array>,dataset_title:string, data_type:string, " @@ -177,7 +175,7 @@ public class ReadReportsListFromDatacite { stmt.execute(sqlCreateTempTableForDatasets); String sqlInsertToDatasetsPerformance = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + 
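
The insert above relies on Hive's get_json_object() to pull individual header fields out of a COUNTER report stored as a single JSON string per row. A reduced sketch of that extraction follows; only the two JSON paths visible in this patch are used, and the staging and target table names are assumptions made for the example.

import java.sql.Connection;
import java.sql.Statement;

public class ExtractReportHeaderSketch {

	// Assumes a staging table <schema>.datacitereports_raw(json STRING) holding one report per row.
	public static void extractHeaders(Connection hive, String schema) throws Exception {
		try (Statement stmt = hive.createStatement()) {
			stmt.executeUpdate(
				"CREATE TABLE IF NOT EXISTS " + schema + ".report_headers (reportid STRING, name STRING)");

			// get_json_object() navigates the document with a JSONPath-like expression.
			stmt.executeUpdate(
				"INSERT INTO " + schema + ".report_headers"
					+ " SELECT"
					+ " get_json_object(json, '$.report.id'),"
					+ " get_json_object(json, '$.report.report-header.report-name')"
					+ " FROM " + schema + ".datacitereports_raw");
		}
	}
}
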
".datasetsperformance_tmp SELECT dataset.dataset_id[0].value ds_type, " + + ".datasetsperformance SELECT dataset.dataset_id[0].value ds_type, " + " dataset.dataset_title ds_title, " + " dataset.yop yop, " + " dataset.data_type dataset_type, " @@ -198,7 +196,7 @@ public class ReadReportsListFromDatacite { stmt.executeUpdate(sqlInsertToDatasetsPerformance); - logger.info("Datasets Performance Inserted for Report " + reportId); + logger.info("Datasets Performance Inserted "); stmt.execute("Drop table " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed"); @@ -296,93 +294,32 @@ public class ReadReportsListFromDatacite { } public void createUsageStatisticsTable() throws SQLException { + logger.info("Dropping Downloads Stats table"); Statement stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Updating Datacite Reports table"); - String createDataciteReportsTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports " - + "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports_tmp"; - stmt.executeUpdate(createDataciteReportsTable); - logger.info("Datacite Reports Table updated"); - - logger.info("Updating Datasets Performance table"); - String createDatasetPerformanceTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance " - + "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance_tmp"; - stmt.executeUpdate(createDatasetPerformanceTable); - logger.info("DatasetsPerformance Table updated"); + String dropDownloadsTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacite_downloads"; + stmt.executeUpdate(dropDownloadsTable); logger.info("Creating Downloads Stats table"); String createDownloadsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacite_downloads STORED AS PARQUET as " + + ".datacite_downloads as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " - + "where metric_type='total-dataset-requests' "; + + "where metric_type='total-dataset-requests'"; stmt.executeUpdate(createDownloadsTable); logger.info("Downloads Stats table created"); logger.info("Creating Views Stats table"); - String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacite_views STORED AS PARQUET as " + String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " - + "where metric_type='total-dataset-investigations' "; + + "where metric_type='total-dataset-investigations'"; stmt.executeUpdate(createViewsTable); logger.info("Views Stats table created"); - - logger.info("Building Permanent Datasets Usage Stats DB"); - - logger.info("Dropping view datacitereports on permanent datacite usagestats DB"); - 
String sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"; - stmt.executeUpdate(sql); - logger.info("Dropped view datacitereports on permanent datacite usagestats DB"); - - logger.info("Create view datacitereports on permanent datacite usagestats DB"); - sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports" - + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports"; - stmt.executeUpdate(sql); - logger.info("Created view datacitereports on permanent datasets usagestats DB"); - - logger.info("Dropping view datasetsperformance on permanent datacite usagestats DB"); - sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"; - stmt.executeUpdate(sql); - logger.info("Dropped view datasetsperformance on permanent datacite usagestats DB"); - - logger.info("Create view datasetsperformance on permanent datacite usagestats DB"); - sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance" - + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance"; - stmt.executeUpdate(sql); - logger.info("Created view datasetsperformance on permanent datasets usagestats DB"); - - logger.info("Dropping view datacite_views on permanent datacite usagestats DB"); - sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"; - stmt.executeUpdate(sql); - logger.info("Dropped view datacite_views on permanent datacite usagestats DB"); - - logger.info("Create view datacite_views on permanent datacite usagestats DB"); - sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views" - + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views"; - stmt.executeUpdate(sql); - logger.info("Created view datacite_views on permanent datasets usagestats DB"); - - logger.info("Dropping view datacite_downloads on permanent datacite usagestats DB"); - sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"; - stmt.executeUpdate(sql); - logger.info("Dropped view datacite_downloads on permanent datacite usagestats DB"); - - logger.info("Create view datacite_downloads on permanent datacite usagestats DB"); - sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads" - + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads"; - stmt.executeUpdate(sql); - logger.info("Created view datacite_downloads on permanent datasets usagestats DB"); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Completed Building Permanent Datasets Usage Stats DB"); } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java old mode 100755 new mode 100644 index 8d6e24333..d96d7e875 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java @@ -2,7 +2,6 @@ package 
eu.dnetlib.oa.graph.datasetsusagestats.export; import java.io.IOException; -import java.sql.SQLException; import java.sql.Statement; import org.apache.hadoop.conf.Configuration; @@ -68,50 +67,5 @@ public class UsageStatsExporter { readReportsListFromDatacite.readReports(); logger.info("Reports Stored To DB"); readReportsListFromDatacite.createUsageStatisticsTable(); - - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } - - logger.info("End"); - } - - private void invalidateMetadata() throws SQLException { - Statement stmt = null; - - stmt = ConnectDB.getImpalaConnection().createStatement(); - - String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"; - stmt.executeUpdate(sql); - - stmt.close(); - try { - ConnectDB.getHiveConnection().close(); - } catch (Exception e) { - logger.info("Message at the end :" + e.getMessage()); - } } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json index f67651627..f8d51a882 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json @@ -1,62 +1,56 @@ [ - { - "paramName": "dbu", - "paramLongName": "dataciteBaseURL", - "paramDescription": "URL of Datacite Reports Endpoint", - "paramRequired": true - }, - { - "paramName": "drp", - "paramLongName": "dataciteReportPath", - "paramDescription": "Path for Datacite Reports", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dusdbs", - "paramLongName": "datasetUsageStatsDBSchema", - "paramDescription": "activate tranform-only mode. 
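
The block removed above (and added back by a later patch in this series) is the step that makes the freshly built Hive tables queryable from Impala: Impala caches table metadata, so after Hive creates or rewrites a table an INVALIDATE METADATA statement has to be issued over the Impala connection. A minimal sketch of that step, with an illustrative URL and table list:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class ImpalaInvalidateSketch {

	public static void main(String[] args) throws Exception {
		Class.forName("org.apache.hive.jdbc.HiveDriver"); // Impala is reached over the HiveServer2 protocol in this setup
		String impalaJdbcUrl = "jdbc:hive2://impala.example.org:21050/;auth=noSasl";
		String[] tables = { "usagestats.datacite_views", "usagestats.datacite_downloads" };

		try (Connection impala = DriverManager.getConnection(impalaJdbcUrl);
			Statement stmt = impala.createStatement()) {
			for (String table : tables) {
				// Refresh Impala's view of a table that Hive just created or rewrote.
				stmt.executeUpdate("INVALIDATE METADATA " + table);
			}
		}
	}
}
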
Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "uspdbs", - "paramLongName": "datasetsUsageStatsPermanentDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "datasetsEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the dataset_usage_stats, visible to Impala", - "paramRequired": true - } + { + "paramName": "dbu", + "paramLongName": "dataciteBaseURL", + "paramDescription": "URL of Datacite Reports Endpoint", + "paramRequired": true + }, + { + "paramName": "drp", + "paramLongName": "dataciteReportPath", + "paramDescription": "Path for Datacite Reports", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dusdbs", + "paramLongName": "datasetUsageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. 
Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "datasetsEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the dataset_usage_stats, visible to Impala", + "paramRequired": true + } ] diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml index 22bf22c01..36c1ccea5 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + hiveMetastoreUris @@ -52,8 +52,6 @@ ${impalaJdbcUrl} --datasetUsageStatsDBSchema ${datasetUsageStatsDBSchema} - --datasetsUsageStatsPermanentDBSchema - ${datasetsUsageStatsPermanentDBSchema} --statsDBSchema ${statsDBSchema} --recreateDbAndTables -- 2.17.1 From 6cb4cbb75e42547bfb0ed6927469713463703750 Mon Sep 17 00:00:00 2001 From: Dimitris Date: Wed, 17 Feb 2021 14:56:13 +0200 Subject: [PATCH 06/12] Changes to logger.info --- .../export/ReadReportsListFromDatacite.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java index 6e8c0e397..1b769bf53 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -105,7 +105,7 @@ public class ReadReportsListFromDatacite { ResultSet rstmpReportAll = stmt.getResultSet(); if (rstmpReportAll.next()) { String listDatasets = rstmpReportAll.getString(1); - logger.info("No compressed performance found"); + logger.info("Adding uncompressed performance for " + reportID); this.readDatasetsReport(listDatasets, reportID); } @@ -125,6 +125,9 @@ public class ReadReportsListFromDatacite { } public void readDatasetsReport(String prettyDatasetsReports, String reportId) throws Exception { + logger.info("Reading Datasets performance for report " + reportId); + logger.info("Write Performance Report To File"); + ObjectMapper objectMapper = new ObjectMapper(); JsonNode jsonNode = objectMapper.readValue(prettyDatasetsReports, JsonNode.class); String datasetsReports = jsonNode.toString(); @@ -151,8 +154,7 @@ public class ReadReportsListFromDatacite { fin.writeChar('\n'); fin.close(); - logger.info("Write Compress Report To File"); - logger.info("Reading Compress Report From File..."); + logger.info("Reading Performance Report From File..."); String sqlCreateTempTableForDatasets = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed (report_datasets 
array>,dataset_title:string, data_type:string, " @@ -196,7 +198,7 @@ public class ReadReportsListFromDatacite { stmt.executeUpdate(sqlInsertToDatasetsPerformance); - logger.info("Datasets Performance Inserted "); + logger.info("Datasets Performance Inserted for Report " + reportId); stmt.execute("Drop table " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjsoncompressesed"); -- 2.17.1 From c90dd653c2845cbedd4e81716ad4bb58c8f5492f Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 4 Jun 2021 15:47:32 +0300 Subject: [PATCH 07/12] Commit datasets changes --- dhp-workflows/dhp-indicators/pom.xml | 107 ---------------- dhp-workflows/dhp-indicators/runworkflow.sh | 1 - .../indicators/export/ExecuteWorkflow.java | 35 ------ .../indicators/oozie_app/config-default.xml | 38 ------ .../indicators/oozie_app/python/testpython.py | 5 - .../indicators/oozie_app/python/testscript.sh | 2 - .../graph/indicators/oozie_app/workflow.xml | 58 --------- .../dhp-usage-datasets-stats-update/pom.xml | 4 +- .../datasetsusagestats/export/ConnectDB.java | 33 ++++- .../export/DatasetsStatsDB.java | 46 +++++-- .../export/ExecuteWorkflow.java | 12 +- .../export/ReadReportsListFromDatacite.java | 85 +++++++++++-- .../export/UsageStatsExporter.java | 46 +++++++ .../datasets_usagestats_parameters.json | 114 +++++++++--------- .../datasetsusagestats/oozie_app/workflow.xml | 4 +- 15 files changed, 258 insertions(+), 332 deletions(-) delete mode 100644 dhp-workflows/dhp-indicators/pom.xml delete mode 100755 dhp-workflows/dhp-indicators/runworkflow.sh delete mode 100644 dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh delete mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml mode change 100644 => 100755 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java diff --git a/dhp-workflows/dhp-indicators/pom.xml b/dhp-workflows/dhp-indicators/pom.xml deleted file mode 100644 index 937795791..000000000 --- a/dhp-workflows/dhp-indicators/pom.xml +++ /dev/null @@ -1,107 +0,0 @@ - - - - - - - - - dhp-workflows - eu.dnetlib.dhp - 1.1.7-SNAPSHOT - - 4.0.0 - dhp-indicators - - - - pl.project13.maven - git-commit-id-plugin - 2.1.15 - - - - revision - - - - - ${project.basedir}/../.git - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.6.1 - - 1.8 - 1.8 - - - - - - UTF-8 - UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 - - - - - org.apache.spark - spark-core_2.11 - 2.2.0 - - - org.apache.spark - spark-sql_2.11 - 2.4.5 - - - com.googlecode.json-simple - json-simple - 1.1.1 - - - org.json - json - 20180130 - jar - - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - c3p0 - c3p0 - 0.9.1.2 - jar - - - dhp-indicators - diff --git a/dhp-workflows/dhp-indicators/runworkflow.sh b/dhp-workflows/dhp-indicators/runworkflow.sh deleted file mode 100755 index 0cad5792d..000000000 --- a/dhp-workflows/dhp-indicators/runworkflow.sh +++ 
/dev/null @@ -1 +0,0 @@ -mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java b/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java deleted file mode 100644 index 61e6ef72c..000000000 --- a/dhp-workflows/dhp-indicators/src/main/java/eu/dnetlib/oa/graph/indicators/export/ExecuteWorkflow.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.indicators.export; - -import java.text.SimpleDateFormat; -import java.util.Calendar; -import java.util.Date; - -import org.apache.commons.io.IOUtils; -import org.apache.log4j.BasicConfigurator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -/** - * @author D. Pierrakos - */ -public class ExecuteWorkflow { - - private static final Logger logger = LoggerFactory.getLogger(ExecuteWorkflow.class); - - public static void main(String args[]) throws Exception { - - // Sending the logs to the console - BasicConfigurator.configure(); - - logger.info("Workflow Executed"); - } - -} diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml deleted file mode 100644 index b5c807378..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - jobTracker - ${jobTracker} - - - nameNode - ${nameNode} - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 - - - impalaJdbcUrl - jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; - - - oozie.wf.workflow.notification.url - {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status - - - oozie.use.system.libpath - true - - diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py deleted file mode 100644 index e913df6ae..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testpython.py +++ /dev/null @@ -1,5 +0,0 @@ -#! 
/usr/bin/env python -import sys - -print "this is a Python script" -print "Python Interpreter Version: " + sys.version \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh deleted file mode 100644 index 78938c85a..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/python/testscript.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -echo "`date` hi" \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml deleted file mode 100644 index 2b8ed7d99..000000000 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml +++ /dev/null @@ -1,58 +0,0 @@ - - - - hiveMetastoreUris - Hive server metastore URIs - - - hiveJdbcUrl - Hive server jdbc url - - - impalaJdbcUrl - Impala server jdbc url - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hiveMetastoreUris} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - ${jobTracker} - ${nameNode} - - - mapred.job.queue.name - ${queueName} - - - testpython.py - python/testpython.py - - - - - - - Python action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml index b39c3ff9b..c623a12f0 100755 --- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml @@ -19,7 +19,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.7-SNAPSHOT + 1.2.4-SNAPSHOT ../ 4.0.0 @@ -96,7 +96,7 @@ eu.dnetlib.dhp dhp-common - 1.1.7-SNAPSHOT + 1.2.4-SNAPSHOT jar diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java index cab0bc83f..de9e44fbf 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java @@ -9,6 +9,10 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; import org.apache.log4j.Logger; @@ -28,6 +32,7 @@ public abstract class ConnectDB { private static String dbHiveUrl; private static String dbImpalaUrl; private static String datasetUsageStatsDBSchema; + private static String datasetsUsageStatsPermanentDBSchema; private static String statsDBSchema; private final static Logger logger = Logger.getLogger(ConnectDB.class); private Statement stmt = null; @@ -37,6 +42,7 @@ public abstract class ConnectDB { dbHiveUrl = ExecuteWorkflow.dbHiveUrl; dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema; + 
datasetsUsageStatsPermanentDBSchema = ExecuteWorkflow.datasetsUsageStatsPermanentDBSchema; statsDBSchema = ExecuteWorkflow.statsDBSchema; Class.forName("org.apache.hive.jdbc.HiveDriver"); @@ -63,14 +69,25 @@ public abstract class ConnectDB { } public static String getDataSetUsageStatsDBSchema() { - return ConnectDB.datasetUsageStatsDBSchema; + String datePattern = "YYYYMMdd"; + DateFormat df = new SimpleDateFormat(datePattern); +// Get the today date using Calendar object. + Date today = Calendar.getInstance().getTime(); + String todayAsString = df.format(today); + + return ConnectDB.datasetUsageStatsDBSchema + "_" + todayAsString; } public static String getStatsDBSchema() { return ConnectDB.statsDBSchema; } + public static String getDatasetsUsagestatsPermanentDBSchema() { + return ConnectDB.datasetsUsageStatsPermanentDBSchema; + } + private static Connection connectHive() throws SQLException { + logger.info("trying to open Hive connection..."); ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbHiveUrl); @@ -90,14 +107,18 @@ public abstract class ConnectDB { cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened database successfully"); + logger.info("Opened HIVE successfully"); return cpds.getConnection(); +// Connection connection = DriverManager.getConnection(dbHiveUrl); +// logger.debug("Opened Hive successfully"); +// +// return connection; } private static Connection connectImpala() throws SQLException { - + logger.info("trying to open Impala connection..."); ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbImpalaUrl); cpds.setUser("dimitris.pierrakos"); @@ -116,8 +137,12 @@ public abstract class ConnectDB { cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened database successfully"); + logger.info("Opened Impala successfully"); return cpds.getConnection(); +// Connection connection = DriverManager.getConnection(dbHiveUrl); +// logger.debug("Opened Impala successfully"); +// +// return connection; } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java index 17661b99e..baffa39e0 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java @@ -1,8 +1,6 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; -import java.sql.Connection; -import java.sql.SQLException; import java.sql.Statement; import org.slf4j.Logger; @@ -47,7 +45,7 @@ public class DatasetsStatsDB { try { stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + logger.info("Creating datacite usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); stmt.executeUpdate(createDatabase); @@ -55,6 +53,23 @@ public class DatasetsStatsDB { logger.error("Failed to create database: " + e); throw new Exception("Failed to create database: " + e.toString(), e); } + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger + .info( + "Creating permanent datasets 
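
The getter restored above derives the working schema name by appending the current date, so each run writes into its own snapshot while the permanent schema (next getter) keeps stable names. A small sketch of that naming follows; note that in SimpleDateFormat the lower-case 'yyyy' is the calendar year, whereas the upper-case 'YYYY' used in the patch is the week-based year and can differ from it around the turn of the year.

import java.text.SimpleDateFormat;
import java.util.Date;

public class SnapshotSchemaNameSketch {

	public static String snapshotSchema(String baseSchema) {
		// Calendar-year pattern; the patch itself uses "YYYYMMdd".
		String today = new SimpleDateFormat("yyyyMMdd").format(new Date());
		return baseSchema + "_" + today; // e.g. datasetsusagestats_20210301
	}

	public static void main(String[] args) {
		System.out.println(snapshotSchema("datasetsusagestats"));
	}
}
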
usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema()); + String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS " + + ConnectDB.getDatasetsUsagestatsPermanentDBSchema(); + stmt.executeUpdate(createPermanentDatabase); + logger + .info( + "Created permanent datasets usagestats DB: " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema()); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } } private void createTables() throws Exception { @@ -62,10 +77,10 @@ public class DatasetsStatsDB { stmt = ConnectDB.getHiveConnection().createStatement(); // Create Reports table - This table should exist - logger.info("Creating Reports Table"); + logger.info("Creating Reports Tmp Table"); String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports(reportid STRING, \n" + + ".datacitereports_tmp(reportid STRING, \n" + " name STRING, \n" + " source STRING,\n" + " release STRING,\n" @@ -79,10 +94,10 @@ public class DatasetsStatsDB { logger.info("Reports Table Created"); // Create Datasets Performance Table - logger.info("Creating DataSetsPerformance Table"); + logger.info("Creating DataSetsPerformance Tmp Table"); String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance(ds_type STRING,\n" + + ".datasetsperformance_tmp(ds_type STRING,\n" + " ds_title STRING,\n" + " yop STRING,\n" + " dataset_type STRING, \n" @@ -100,7 +115,22 @@ public class DatasetsStatsDB { + " CLUSTERED BY (ds_type)\n" + " into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTableDataSetsPerformance); - logger.info("DataSetsPerformance Table Created"); + logger.info("DataSetsPerformance Tmp Table Created"); + + logger.info("Creating Datacite Reports table"); + String createDataciteReportsTable = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports LIKE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports_tmp STORED AS PARQUET"; + stmt.executeUpdate(createDataciteReportsTable); + logger.info("Datacite Reports Table created"); + + logger.info("Creating Datasets Performance table"); + String createDatasetPerformanceTable = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance LIKE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance_tmp STORED AS PARQUET"; + stmt.executeUpdate(createDatasetPerformanceTable); + logger.info("DatasetsPerformance Table created"); stmt.close(); ConnectDB.getHiveConnection().close(); diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java index b28578e4b..ffa8b8199 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java @@ -21,6 +21,7 @@ public class ExecuteWorkflow { static String dbHiveUrl; static String dbImpalaUrl; static String datasetUsageStatsDBSchema; + static String datasetsUsageStatsPermanentDBSchema; static String statsDBSchema; static 
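
The table definitions restored above set up a two-stage layout: reports are first accumulated in bucketed, transactional ORC "_tmp" tables so that each report can be inserted incrementally, and matching Parquet tables created with CREATE TABLE ... LIKE receive the consolidated data at the end of the run. A condensed sketch of that staging-and-promotion flow, with illustrative schema, table and column names:

import java.sql.Connection;
import java.sql.Statement;

public class TmpToParquetPromotionSketch {

	public static void promote(Connection hive, String schema) throws Exception {
		try (Statement stmt = hive.createStatement()) {
			// Staging table: bucketed ORC with ACID enabled, filled report by report.
			stmt.executeUpdate(
				"CREATE TABLE IF NOT EXISTS " + schema + ".reports_tmp (reportid STRING, name STRING)"
					+ " CLUSTERED BY (reportid) INTO 100 BUCKETS"
					+ " STORED AS ORC TBLPROPERTIES('transactional'='true')");

			// Final table: same columns, Parquet storage, queried by the stats consumers.
			stmt.executeUpdate(
				"CREATE TABLE IF NOT EXISTS " + schema + ".reports LIKE " + schema + ".reports_tmp"
					+ " STORED AS PARQUET");

			// Promotion: append everything gathered in this run to the final table.
			stmt.executeUpdate(
				"INSERT INTO " + schema + ".reports SELECT * FROM " + schema + ".reports_tmp");
		}
	}
}
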
boolean recreateDbAndTables; static boolean datasetsEmptyDirs; @@ -45,6 +46,7 @@ public class ExecuteWorkflow { dbHiveUrl = parser.get("dbHiveUrl"); dbImpalaUrl = parser.get("dbImpalaUrl"); datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema"); + datasetsUsageStatsPermanentDBSchema = parser.get("datasetsUsageStatsPermanentDBSchema"); statsDBSchema = parser.get("statsDBSchema"); if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) @@ -57,11 +59,11 @@ public class ExecuteWorkflow { else datasetsEmptyDirs = false; -// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) -// finalTablesVisibleToImpala = true; -// else -// finalTablesVisibleToImpala = false; -// + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) + finalTablesVisibleToImpala = true; + else + finalTablesVisibleToImpala = false; + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java index 1b769bf53..e89e2e5a4 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -65,7 +65,7 @@ public class ReadReportsListFromDatacite { logger.info("Checking report with id " + reportID); String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports where reportid=?"; + + ".datacitereports_tmp where reportid=?"; PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); stGetReportID.setString(1, reportID); @@ -76,7 +76,7 @@ public class ReadReportsListFromDatacite { dropTmpReportsTable(); } else { String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datacitereports " + + " .datacitereports_tmp " + "SELECT\n" + " get_json_object(json, '$.report.id') AS reportid,\n" + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" @@ -127,7 +127,7 @@ public class ReadReportsListFromDatacite { public void readDatasetsReport(String prettyDatasetsReports, String reportId) throws Exception { logger.info("Reading Datasets performance for report " + reportId); logger.info("Write Performance Report To File"); - + ConnectDB.getHiveConnection().setAutoCommit(false); ObjectMapper objectMapper = new ObjectMapper(); JsonNode jsonNode = objectMapper.readValue(prettyDatasetsReports, JsonNode.class); String datasetsReports = jsonNode.toString(); @@ -177,7 +177,7 @@ public class ReadReportsListFromDatacite { stmt.execute(sqlCreateTempTableForDatasets); String sqlInsertToDatasetsPerformance = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance SELECT dataset.dataset_id[0].value ds_type, " + + ".datasetsperformance_tmp SELECT dataset.dataset_id[0].value ds_type, " + " dataset.dataset_title ds_title, " + " dataset.yop yop, " + " dataset.data_type dataset_type, " @@ -296,32 +296,93 @@ public class ReadReportsListFromDatacite { } public void createUsageStatisticsTable() throws SQLException { - logger.info("Dropping Downloads Stats table"); 
Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String dropDownloadsTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacite_downloads"; - stmt.executeUpdate(dropDownloadsTable); + + logger.info("Updating Datacite Reports table"); + String createDataciteReportsTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports " + + "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports_tmp"; + stmt.executeUpdate(createDataciteReportsTable); + logger.info("Datacite Reports Table updated"); + + logger.info("Updating Datasets Performance table"); + String createDatasetPerformanceTable = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance " + + "SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance_tmp"; + stmt.executeUpdate(createDatasetPerformanceTable); + logger.info("DatasetsPerformance Table updated"); logger.info("Creating Downloads Stats table"); String createDownloadsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacite_downloads as " + + ".datacite_downloads STORED AS PARQUET as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " - + "where metric_type='total-dataset-requests'"; + + "where metric_type='total-dataset-requests' "; stmt.executeUpdate(createDownloadsTable); logger.info("Downloads Stats table created"); logger.info("Creating Views Stats table"); - String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views as " + String createViewsTable = "CREATE TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacite_views STORED AS PARQUET as " + "SELECT 'Datacite' source, d.id repository_id, od.id result_id, regexp_replace(substring(string(period_end),0,7),'-','/') date, count, '0' openaire " + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance " + "JOIN " + ConnectDB.getStatsDBSchema() + ".datasource d on name=platform " + "JOIN " + ConnectDB.getStatsDBSchema() + ".result_oids od on string(ds_type)=od.oid " - + "where metric_type='total-dataset-investigations'"; + + "where metric_type='total-dataset-investigations' "; stmt.executeUpdate(createViewsTable); logger.info("Views Stats table created"); + + logger.info("Building Permanent Datasets Usage Stats DB"); + + logger.info("Dropping view datacitereports on permanent datacite usagestats DB"); + String sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacitereports on permanent datacite usagestats DB"); + + logger.info("Create view datacitereports on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + logger.info("Created view datacitereports on permanent datasets usagestats DB"); + + logger.info("Dropping view datasetsperformance on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + 
ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + logger.info("Dropped view datasetsperformance on permanent datacite usagestats DB"); + + logger.info("Create view datasetsperformance on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + logger.info("Created view datasetsperformance on permanent datasets usagestats DB"); + + logger.info("Dropping view datacite_views on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacite_views on permanent datacite usagestats DB"); + + logger.info("Create view datacite_views on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + logger.info("Created view datacite_views on permanent datasets usagestats DB"); + + logger.info("Dropping view datacite_downloads on permanent datacite usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Dropped view datacite_downloads on permanent datacite usagestats DB"); + + logger.info("Create view datacite_downloads on permanent datacite usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads" + + " AS SELECT * FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + logger.info("Created view datacite_downloads on permanent datasets usagestats DB"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Completed Building Permanent Datasets Usage Stats DB"); } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java old mode 100644 new mode 100755 index d96d7e875..8d6e24333 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java @@ -2,6 +2,7 @@ package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.io.IOException; +import java.sql.SQLException; import java.sql.Statement; import org.apache.hadoop.conf.Configuration; @@ -67,5 +68,50 @@ public class UsageStatsExporter { readReportsListFromDatacite.readReports(); logger.info("Reports Stored To DB"); readReportsListFromDatacite.createUsageStatisticsTable(); + + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } + + logger.info("End"); + } + + private void invalidateMetadata() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getImpalaConnection().createStatement(); + + String sql = "INVALIDATE METADATA " + 
ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_downloads"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacite_views"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datacitereports"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDatasetsUsagestatsPermanentDBSchema() + ".datasetsperformance"; + stmt.executeUpdate(sql); + + stmt.close(); + try { + ConnectDB.getHiveConnection().close(); + } catch (Exception e) { + logger.info("Message at the end :" + e.getMessage()); + } } } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json index f8d51a882..f67651627 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json @@ -1,56 +1,62 @@ [ - { - "paramName": "dbu", - "paramLongName": "dataciteBaseURL", - "paramDescription": "URL of Datacite Reports Endpoint", - "paramRequired": true - }, - { - "paramName": "drp", - "paramLongName": "dataciteReportPath", - "paramDescription": "Path for Datacite Reports", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dusdbs", - "paramLongName": "datasetUsageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. 
Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "datasetsEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the dataset_usage_stats, visible to Impala", - "paramRequired": true - } + { + "paramName": "dbu", + "paramLongName": "dataciteBaseURL", + "paramDescription": "URL of Datacite Reports Endpoint", + "paramRequired": true + }, + { + "paramName": "drp", + "paramLongName": "dataciteReportPath", + "paramDescription": "Path for Datacite Reports", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dusdbs", + "paramLongName": "datasetUsageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "uspdbs", + "paramLongName": "datasetsUsageStatsPermanentDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "datasetsEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the dataset_usage_stats, visible to Impala", + "paramRequired": true + } ] diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml index 36c1ccea5..22bf22c01 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + hiveMetastoreUris @@ -52,6 +52,8 @@ ${impalaJdbcUrl} --datasetUsageStatsDBSchema ${datasetUsageStatsDBSchema} + --datasetsUsageStatsPermanentDBSchema + ${datasetsUsageStatsPermanentDBSchema} --statsDBSchema ${statsDBSchema} --recreateDbAndTables -- 2.17.1 From da1f123d7bd0304775c5966c1ff8cc36984e9825 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 4 Jun 2021 16:36:35 +0300 Subject: [PATCH 08/12] Add indicators wf --- .../dhp-indicators/nb-configuration.xml | 18 +++ dhp-workflows/dhp-indicators/pom.xml | 107 ++++++++++++++++++ dhp-workflows/dhp-indicators/runworkflow.sh | 1 + .../indicators/oozie_app/config-default.xml | 34 ++++++ .../scripts/createIndicatorsTables.sql | 7 ++ 
.../oozie_app/scripts/indicators.sh | 29 +++++ .../graph/indicators/oozie_app/workflow.xml | 101 +++++++++++++++++ 7 files changed, 297 insertions(+) create mode 100644 dhp-workflows/dhp-indicators/nb-configuration.xml create mode 100755 dhp-workflows/dhp-indicators/pom.xml create mode 100755 dhp-workflows/dhp-indicators/runworkflow.sh create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh create mode 100644 dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-indicators/nb-configuration.xml b/dhp-workflows/dhp-indicators/nb-configuration.xml new file mode 100644 index 000000000..a65c4514a --- /dev/null +++ b/dhp-workflows/dhp-indicators/nb-configuration.xml @@ -0,0 +1,18 @@ + + + + + + JDK_1.8 + + diff --git a/dhp-workflows/dhp-indicators/pom.xml b/dhp-workflows/dhp-indicators/pom.xml new file mode 100755 index 000000000..72ad153f1 --- /dev/null +++ b/dhp-workflows/dhp-indicators/pom.xml @@ -0,0 +1,107 @@ + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + ../ + + 4.0.0 + dhp-indicators + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + 2.7.4 + jar + + + eu.dnetlib.dhp + dhp-common + 1.1.7-SNAPSHOT + jar + + + com.mchange + c3p0 + 0.9.5.2 + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + org.slf4j + slf4j-api + 1.7.26 + jar + + + dhp-indicators + diff --git a/dhp-workflows/dhp-indicators/runworkflow.sh b/dhp-workflows/dhp-indicators/runworkflow.sh new file mode 100755 index 000000000..0cad5792d --- /dev/null +++ b/dhp-workflows/dhp-indicators/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/indicators \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml new file mode 100644 index 000000000..6d255a7f4 --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/config-default.xml @@ -0,0 +1,34 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + \ No newline at end of file diff --git 
a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql new file mode 100644 index 000000000..0a96063cb --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql @@ -0,0 +1,7 @@ +create table TARGET.funders_publications stored as parquet as +select f.id as id, count(pr.result) as total_pubs from SOURCE.funder f +join SOURCE.project p on f.name=p.funder +join SOURCE.project_results_publication pr on pr.project_results=p.id group by f.id, f.name; + + +compute stats TARGET.funders_publications; \ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh new file mode 100644 index 000000000..306609e8a --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh @@ -0,0 +1,29 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +echo "Creating indicators database" +impala-shell -q "drop database if exists ${TARGET} cascade" +impala-shell -q "create database if not exists ${TARGET}" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +cat createIndicatorsTables.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +echo "Indicators Database created" + + +echo "Updating Shadow indicators DB" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Indicators Shadow DB ready!" 
\ No newline at end of file diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml new file mode 100644 index 000000000..ec917b9a4 --- /dev/null +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml @@ -0,0 +1,101 @@ + + + + stats_db_name + the source stats database name + + + indicators_db_name + the target indicators database name + + + indicators_shadow_db_name + the name of the shadow schema + + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + + + + + + + ${jobTracker} + ${nameNode} + indicators.sh + ${stats_db_name} + ${indicators_db_name} + ${indicators_shadow_db_name} + ${wf:appPath()}/scripts/createIndicatorsTables.sql + scripts/indicators.sh + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + \ No newline at end of file -- 2.17.1 From 2cd0514397e4ae776abe09e3c9909a9704b87600 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 22 Jun 2021 09:26:49 +0300 Subject: [PATCH 09/12] Update createIndicatorsTables.sql --- .../scripts/createIndicatorsTables.sql | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql index 0a96063cb..8741c5bff 100644 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql @@ -3,5 +3,48 @@ select f.id as id, count(pr.result) as total_pubs from SOURCE.funder f join SOURCE.project p on f.name=p.funder join SOURCE.project_results_publication pr on pr.project_results=p.id group by f.id, f.name; +create table TARGET.indi_pub_green_oa stored as parquet as +select distinct p.id, coalesce(green_oa, 0) as green_oa +from SOURCE.publication p +left outer join ( +select p.id, 1 as green_oa +from SOURCE.publication p +join SOURCE.result_instance ri on ri.id = p.id +join SOURCE.datasource on datasource.id = ri.hostedby +where SOURCE.datasource.type like '%Repository%' +and (ri.accessright = 'Open Access' +or ri.accessright = 'Embargo')) tmp +on p.id= tmp.id; + + +create table TARGET.indi_pub_grey_lit stored as parquet as +select distinct p.id, coalesce(grey_lit, 0) as grey_lit +from SOURCE.publication p +left outer join ( +select p.id, 1 as grey_lit +from SOURCE.publication p +join SOURCE.result_classifications rt on rt.id = p.id +where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and +not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; + +create table TARGET.indi_pub_doi_from_crossref stored as parquet as +select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref +from SOURCE.publication p +left outer join +(select ri.id, 1 as doi_from_crossref from SOURCE.result_instance ri +join 
SOURCE.datasource d on d.id = ri.collectedfrom +where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp +on tmp.id=p.id; + +create table TARGET.indi_pub_gold_oa stored as parquet as +select distinct p.id, coalesce(gold_oa, 0) as gold_oa +from SOURCE.publication p +left outer join ( +select p.id, 1 as gold_oa +from SOURCE.publication p +join SOURCE.result_instance ri on ri.id = p.id +join SOURCE.datasource on datasource.id = ri.hostedby +where SOURCE.datasource.id like '%doajarticles%') tmp +on p.id= tmp.id; compute stats TARGET.funders_publications; \ No newline at end of file -- 2.17.1 From 50cda834bdf758d774b86b4d445423c53ed4dc0a Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 22 Jun 2021 09:37:00 +0300 Subject: [PATCH 10/12] Update createIndicatorsTables.sql --- .../oozie_app/scripts/createIndicatorsTables.sql | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql index 8741c5bff..16635e085 100644 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql @@ -1,8 +1,3 @@ -create table TARGET.funders_publications stored as parquet as -select f.id as id, count(pr.result) as total_pubs from SOURCE.funder f -join SOURCE.project p on f.name=p.funder -join SOURCE.project_results_publication pr on pr.project_results=p.id group by f.id, f.name; - create table TARGET.indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa from SOURCE.publication p @@ -47,4 +42,7 @@ join SOURCE.datasource on datasource.id = ri.hostedby where SOURCE.datasource.id like '%doajarticles%') tmp on p.id= tmp.id; -compute stats TARGET.funders_publications; \ No newline at end of file +compute stats TARGET.indi_pub_green_oa; +compute stats TARGET.indi_pub_grey_lit; +compute stats TARGET.indi_pub_doi_from_crossref; +compute stats TARGET.indi_pub_gold_oa; \ No newline at end of file -- 2.17.1 From 6418208d57932e7b9cbd44722427458c4f963834 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 22 Jun 2021 10:44:41 +0300 Subject: [PATCH 11/12] Update createIndicatorsTables.sql --- .../indicators/oozie_app/scripts/createIndicatorsTables.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql index 16635e085..e900c46c7 100644 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql @@ -11,7 +11,6 @@ and (ri.accessright = 'Open Access' or ri.accessright = 'Embargo')) tmp on p.id= tmp.id; - create table TARGET.indi_pub_grey_lit stored as parquet as select distinct p.id, coalesce(grey_lit, 0) as grey_lit from SOURCE.publication p -- 2.17.1 From bc40663b61777f601fa6094883cdfe99fbff8084 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 
22 Jun 2021 14:15:32 +0300 Subject: [PATCH 12/12] Update createIndicatorsTables.sql --- .../indicators/oozie_app/scripts/createIndicatorsTables.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql index e900c46c7..fe9eaec04 100644 --- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql +++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTables.sql @@ -19,7 +19,7 @@ select p.id, 1 as grey_lit from SOURCE.publication p join SOURCE.result_classifications rt on rt.id = p.id where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and -not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; +not exists (select 1 from SOURCE.result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; create table TARGET.indi_pub_doi_from_crossref stored as parquet as select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref -- 2.17.1
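
Note on the schema naming used throughout these patches: ConnectDB derives a per-run Hive schema by suffixing the configured datasets-usage-stats schema with the current date, while the "permanent" schema keeps a stable name and only exposes views over the latest dated schema. The following is a minimal, self-contained sketch of that pattern, not the module's actual ConnectDB class (class, field and schema names here are placeholders). It uses the lower-case "yyyyMMdd" pattern; the patch's "YYYYMMdd" selects SimpleDateFormat's week-based year, which can differ from the calendar year around 1 January.

// Illustrative sketch only: builds a date-suffixed Hive schema name,
// e.g. "datasetusagestats_20210622", plus the stable permanent schema name.
import java.text.SimpleDateFormat;
import java.util.Date;

public class SchemaNames {

    // In the workflow these values come from the Oozie/CLI parameters;
    // they are hard-coded here purely for illustration.
    private static final String DATASET_USAGE_STATS_DB_SCHEMA = "datasetusagestats";
    private static final String PERMANENT_DB_SCHEMA = "datasetusagestats_permanent";

    public static String getDataSetUsageStatsDBSchema() {
        // "yyyy" is the calendar year; "YYYY" would be the week-based year.
        String todayAsString = new SimpleDateFormat("yyyyMMdd").format(new Date());
        return DATASET_USAGE_STATS_DB_SCHEMA + "_" + todayAsString;
    }

    public static String getPermanentDBSchema() {
        // Not date-suffixed: it only holds views over the latest dated schema.
        return PERMANENT_DB_SCHEMA;
    }

    public static void main(String[] args) {
        System.out.println(getDataSetUsageStatsDBSchema());
        System.out.println(getPermanentDBSchema());
    }
}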
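
The table life-cycle these patches introduce follows one repeated shape: load reports into *_tmp ORC tables, copy them into Parquet-backed tables in the dated schema, recreate views in the permanent schema, and finally run INVALIDATE METADATA so Impala sees objects created through Hive. The outline below is an assumption-labelled sketch of that flow, not the workflow's code: the JDBC URLs, schema names and the single table shown are placeholders; only the statement shapes mirror the patch.

// Illustrative outline of the tmp -> Parquet -> view -> INVALIDATE METADATA flow.
// URLs and schema names are placeholders; drivers must be on the classpath.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class PromoteAndPublish {
    public static void main(String[] args) throws Exception {
        String stats = "datasetusagestats_20210622";      // dated working schema
        String permanent = "datasetusagestats_permanent"; // stable schema with views

        try (Connection hive = DriverManager.getConnection("jdbc:hive2://hive-host:10000/");
             Statement st = hive.createStatement()) {
            // 1. Promote the per-run tmp table into the Parquet-backed table.
            st.executeUpdate("INSERT INTO " + stats + ".datacitereports "
                + "SELECT * FROM " + stats + ".datacitereports_tmp");
            // 2. Recreate the stable view so consumers never see the dated name.
            st.executeUpdate("DROP VIEW IF EXISTS " + permanent + ".datacitereports");
            st.executeUpdate("CREATE VIEW " + permanent + ".datacitereports AS "
                + "SELECT * FROM " + stats + ".datacitereports");
        }

        try (Connection impala = DriverManager.getConnection("jdbc:impala://impala-host:21050/");
             Statement st = impala.createStatement()) {
            // 3. Refresh Impala's catalog for objects created through Hive.
            st.executeUpdate("INVALIDATE METADATA " + permanent + ".datacitereports");
        }
    }
}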