diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
new file mode 100644
index 000000000..14b543a57
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
@@ -0,0 +1,79 @@
+
+
+
+
+
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.1.7-SNAPSHOT
+
+ 4.0.0
+ dhp-usage-datasets-stats-update
+
+
+ UTF-8
+ UTF-8
+ 0.13.1-cdh5.2.1
+ 2.5.0-cdh5.2.1
+
+
+
+
+ org.apache.spark
+ spark-core_2.11
+ 2.2.0
+
+
+ org.apache.spark
+ spark-sql_2.11
+ 2.4.5
+
+
+ com.googlecode.json-simple
+ json-simple
+ 1.1.1
+
+
+ org.json
+ json
+ 20180130
+ jar
+
+
+ org.apache.hive
+ hive-jdbc
+ ${cdh.hive.version}
+
+
+ org.apache.hadoop
+ hadoop-common
+ ${cdh.hadoop.version}
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ c3p0
+ c3p0
+ 0.9.1.2
+ jar
+
+
+ dhp-usage-datasets-stats-update
+
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
new file mode 100644
index 000000000..e6da7eff3
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
@@ -0,0 +1,202 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.datasetsusagestats.export;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Properties;
+
+import org.apache.log4j.Logger;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+import com.mchange.v2.c3p0.ComboPooledDataSource;
+
+public abstract class ConnectDB {
+
+ public static Connection DB_HIVE_CONNECTION;
+ public static Connection DB_IMPALA_CONNECTION;
+
+ private static String dbHiveUrl;
+ private static String dbImpalaUrl;
+ private static String datasetUsageStatsDBSchema;
+ private static String statsDBSchema;
+ private final static Logger logger = Logger.getLogger(ConnectDB.class);
+ private Statement stmt = null;
+
+ static void init() throws ClassNotFoundException {
+
+ dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
+ dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
+ datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema;
+ statsDBSchema = ExecuteWorkflow.statsDBSchema;
+
+ Class.forName("org.apache.hive.jdbc.HiveDriver");
+ }
+
+ public static Connection getHiveConnection() throws SQLException {
+ if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) {
+ return DB_HIVE_CONNECTION;
+ } else {
+ DB_HIVE_CONNECTION = connectHive();
+
+ return DB_HIVE_CONNECTION;
+ }
+ }
+
+ public static Connection getImpalaConnection() throws SQLException {
+ if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) {
+ return DB_IMPALA_CONNECTION;
+ } else {
+ DB_IMPALA_CONNECTION = connectImpala();
+
+ return DB_IMPALA_CONNECTION;
+ }
+ }
+
+ public static String getDataSetUsageStatsDBSchema() {
+ return ConnectDB.datasetUsageStatsDBSchema;
+ }
+
+ public static String getStatsDBSchema() {
+ return ConnectDB.statsDBSchema;
+ }
+
+ private static Connection connectHive() throws SQLException {
+ /*
+ * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt =
+ * connection.createStatement(); log.debug("Opened database successfully"); return connection;
+ */
+ ComboPooledDataSource cpds = new ComboPooledDataSource();
+ cpds.setJdbcUrl(dbHiveUrl);
+ cpds.setAcquireIncrement(1);
+ cpds.setMaxPoolSize(100);
+ cpds.setMinPoolSize(1);
+ cpds.setInitialPoolSize(1);
+ cpds.setMaxIdleTime(300);
+ cpds.setMaxConnectionAge(36000);
+
+ cpds.setAcquireRetryAttempts(5);
+ cpds.setAcquireRetryDelay(2000);
+ cpds.setBreakAfterAcquireFailure(false);
+
+ cpds.setCheckoutTimeout(0);
+ cpds.setPreferredTestQuery("SELECT 1");
+ cpds.setIdleConnectionTestPeriod(60);
+
+ logger.info("Opened database successfully");
+
+ return cpds.getConnection();
+
+ }
+
+ private static Connection connectImpala() throws SQLException {
+ /*
+ * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt =
+ * connection.createStatement(); log.debug("Opened database successfully"); return connection;
+ */
+ ComboPooledDataSource cpds = new ComboPooledDataSource();
+ cpds.setJdbcUrl(dbImpalaUrl);
+ cpds.setAcquireIncrement(1);
+ cpds.setMaxPoolSize(100);
+ cpds.setMinPoolSize(1);
+ cpds.setInitialPoolSize(1);
+ cpds.setMaxIdleTime(300);
+ cpds.setMaxConnectionAge(36000);
+
+ cpds.setAcquireRetryAttempts(5);
+ cpds.setAcquireRetryDelay(2000);
+ cpds.setBreakAfterAcquireFailure(false);
+
+ cpds.setCheckoutTimeout(0);
+ cpds.setPreferredTestQuery("SELECT 1");
+ cpds.setIdleConnectionTestPeriod(60);
+
+ logger.info("Opened database successfully");
+ return cpds.getConnection();
+
+ }
+
+ private void createDatabase() throws Exception {
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Dropping logs DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
+ String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
+ stmt.executeUpdate(dropDatabase);
+ } catch (Exception e) {
+ logger.error("Failed to drop database: " + e);
+ throw new Exception("Failed to drop database: " + e.toString(), e);
+ }
+
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
+ String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
+ stmt.executeUpdate(createDatabase);
+
+ } catch (Exception e) {
+ logger.error("Failed to create database: " + e);
+ throw new Exception("Failed to create database: " + e.toString(), e);
+ }
+ }
+
+ private void createTables() throws Exception {
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ // Create Piwiklog table - This table should exist
+ String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getDataSetUsageStatsDBSchema()
+ + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, action, timestamp, entity_id) "
+ + "into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTablePiwikLog);
+
+ /////////////////////////////////////////
+ // Rule for duplicate inserts @ piwiklog
+ /////////////////////////////////////////
+
+ String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getDataSetUsageStatsDBSchema()
+ + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTablePortalLog);
+
+ //////////////////////////////////////////////////
+ // Rule for duplicate inserts @ process_portal_log
+ //////////////////////////////////////////////////
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
+}
+/*
+CREATE TABLE IF NOT EXISTS dataciteReports (reportid STRING,
+ name STRING,
+ source STRING,
+ release STRING,
+ createdby STRING,
+ report_end_date STRING,
+ report_start_date STRING)
+ CLUSTERED BY (reportid)
+ into 100 buckets stored as orc tblproperties('transactional'='true');
+*/
\ No newline at end of file
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java
new file mode 100644
index 000000000..196238ea2
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java
@@ -0,0 +1,97 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package eu.dnetlib.oa.graph.datasetsusagestats.export;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import com.google.gson.JsonObject;
+import java.util.ArrayList;
+import java.util.Iterator;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.parser.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ *
+ * @author dpie
+ */
+public class DownloadReportsListFromDatacite {
+
+ private String dataciteBaseURL;
+ private String dataciteReportPath;
+ private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
+
+ public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) throws MalformedURLException, Exception {
+
+ this.dataciteBaseURL = dataciteBaseURL;
+ this.dataciteReportPath = dataciteReportPath;
+ }
+
+ public void downloadReportsList() throws ParseException {
+ StringBuilder responseStrBuilder = new StringBuilder();
+
+ Gson gson = new Gson();
+
+ try {
+ BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream());
+ logger.info("Downloading from " + dataciteBaseURL);
+
+ BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+ String inputStr;
+
+ while ((inputStr = streamReader.readLine()) != null) {
+ responseStrBuilder.append(inputStr);
+ }
+ } catch (IOException e) {
+ logger.info(e.getMessage());
+ }
+ JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class);
+ JsonArray dataArray = jsonObject.getAsJsonArray("reports");
+ ArrayList reportsList = new ArrayList();
+ for (JsonElement element : dataArray) {
+ reportsList.add(element.getAsJsonObject().get("id").getAsString());
+ }
+
+ Iterator it = reportsList.iterator();
+ while (it.hasNext()) {
+ String reportId = it.next().toString();
+ String url = dataciteBaseURL + reportId;
+
+ try {
+ BufferedInputStream in = new BufferedInputStream(new URL(url).openStream());
+ BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+ String inputStr;
+ StringBuilder responseStrBuilder2 = new StringBuilder();
+ while ((inputStr = streamReader.readLine()) != null) {
+ responseStrBuilder2.append(inputStr);
+ }
+ FileSystem fs = FileSystem.get(new Configuration());
+ FSDataOutputStream fin = fs.create(new Path(dataciteReportPath + "/" + reportId + ".json"),
+ true);
+ byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
+
+ fin.close();
+
+ fin.close();
+ } catch (IOException e) {
+ System.out.println(e);
+ }
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
new file mode 100644
index 000000000..7b3db3115
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
@@ -0,0 +1,70 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.datasetsusagestats.export;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.BasicConfigurator;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class ExecuteWorkflow {
+
+    // Workflow parameters, filled in by main and read by the other classes of this package.
+    static String dataciteBaseURL;
+    static String dataciteReportPath;
+    static String dbHiveUrl;
+    static String dbImpalaUrl;
+    static String datasetUsageStatsDBSchema;
+    static String statsDBSchema;
+    static boolean recreateDbAndTables;
+    static boolean datasetsEmptyDirs;
+    static boolean finalTablesVisibleToImpala;
+
+    /** Classpath location of the JSON file that declares the accepted command line parameters. */
+    private static final String PARAMETERS_RESOURCE = "/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json";
+
+    /**
+     * Entry point: parses the command line, publishes the values through the static fields of this
+     * class and runs {@link UsageStatsExporter#export()}.
+     *
+     * @param args command line arguments, matched against the parameters resource
+     * @throws Exception if the parameters cannot be read or parsed, or if the export fails
+     */
+    public static void main(String args[]) throws Exception {
+
+        // Sending the logs to the console
+        BasicConfigurator.configure();
+
+        final ArgumentApplicationParser parser;
+        // The resource stream is closed as soon as its content has been read.
+        try (java.io.InputStream parameters = ExecuteWorkflow.class.getResourceAsStream(PARAMETERS_RESOURCE)) {
+            parser = new ArgumentApplicationParser(IOUtils.toString(parameters));
+        }
+        parser.parseArgument(args);
+
+        // Setting up the initial parameters
+        dataciteBaseURL = parser.get("dataciteBaseURL");
+        dataciteReportPath = parser.get("dataciteReportPath");
+        dbHiveUrl = parser.get("dbHiveUrl");
+        dbImpalaUrl = parser.get("dbImpalaUrl");
+        datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema");
+        statsDBSchema = parser.get("statsDBSchema");
+
+        // parseBoolean compares case-insensitively without the default locale, which
+        // toLowerCase() depends on; any value other than "true" still means false.
+        recreateDbAndTables = Boolean.parseBoolean(parser.get("recreateDbAndTables"));
+        datasetsEmptyDirs = Boolean.parseBoolean(parser.get("datasetsEmptyDirs"));
+        finalTablesVisibleToImpala = Boolean.parseBoolean(parser.get("finalTablesVisibleToImpala"));
+
+        UsageStatsExporter usagestatsExport = new UsageStatsExporter();
+        usagestatsExport.export();
+    }
+
+}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
new file mode 100644
index 000000000..28c4f30a1
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
@@ -0,0 +1,236 @@
+package eu.dnetlib.oa.graph.datasetsusagestats.export;
+
+import java.io.IOException;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Main class for downloading and processing Usage statistics
+ *
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class UsageStatsExporter {
+
+    private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
+
+    public UsageStatsExporter() {
+    }
+
+    /** Recursively deletes the Datacite report directory and creates it again, empty. */
+    private void reCreateLogDirs() throws IllegalArgumentException, IOException {
+        FileSystem dfs = FileSystem.get(new Configuration());
+        Path reportDir = new Path(ExecuteWorkflow.dataciteReportPath);
+        logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath);
+        dfs.delete(reportDir, true);
+
+        logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath);
+        dfs.mkdirs(reportDir);
+    }
+
+    /**
+     * Runs the export: initialises {@link ConnectDB}; when {@code recreateDbAndTables} is set,
+     * re-creates the database, its table and the report directory; when {@code datasetsEmptyDirs}
+     * is set, downloads the Datacite reports.
+     *
+     * @throws Exception if the driver cannot be loaded or a database or file system step fails
+     */
+    public void export() throws Exception {
+        logger.info("Initialising DB properties");
+        ConnectDB.init();
+        ConnectDB.getHiveConnection();
+
+        if (ExecuteWorkflow.recreateDbAndTables) {
+            createDatabase();
+            createTables();
+            reCreateLogDirs();
+        }
+        logger.info("Initializing the download logs module");
+        DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL,
+            ExecuteWorkflow.dataciteReportPath);
+
+        if (ExecuteWorkflow.datasetsEmptyDirs) {
+            logger.info("Downloading Reports List From Datacite");
+            drfd.downloadReportsList();
+            logger.info("Reports List has been downloaded");
+        }
+    }
+
+    /**
+     * Drops the datasets usage stats database (CASCADE) and creates it again.
+     *
+     * @throws Exception wrapping the failure of either the drop or the create step
+     */
+    private void createDatabase() throws Exception {
+        final String schema = ConnectDB.getDataSetUsageStatsDBSchema();
+
+        // try-with-resources closes each statement even when executeUpdate throws
+        try (Statement dropStmt = ConnectDB.getHiveConnection().createStatement()) {
+            logger.info("Dropping datasetUsageStats DB: " + schema);
+            dropStmt.executeUpdate("DROP DATABASE IF EXISTS " + schema + " CASCADE");
+        } catch (Exception e) {
+            logger.error("Failed to drop database: " + e);
+            throw new Exception("Failed to drop database: " + e.toString(), e);
+        }
+
+        try (Statement createStmt = ConnectDB.getHiveConnection().createStatement()) {
+            logger.info("Creating datasetUsageStats DB: " + schema);
+            createStmt.executeUpdate("CREATE DATABASE IF NOT EXISTS " + schema);
+        } catch (Exception e) {
+            logger.error("Failed to create database: " + e);
+            throw new Exception("Failed to create database: " + e.toString(), e);
+        }
+    }
+
+    /** Creates the {@code datacitereports} table if missing, then closes the shared Hive connection. */
+    private void createTables() throws Exception {
+        // Closed in reverse order (statement, then connection), also when the statement fails.
+        try (java.sql.Connection hive = ConnectDB.getHiveConnection(); Statement tableStmt = hive.createStatement()) {
+            // Create Reports table - This table should exist
+            String sqlCreateTableDataciteeReports = "CREATE TABLE IF NOT EXISTS "
+                + ConnectDB.getDataSetUsageStatsDBSchema()
+                + ".datacitereports(reportid STRING, \n"
+                + " name STRING, \n"
+                + " source STRING,\n"
+                + " release STRING,\n"
+                + " createdby STRING,\n"
+                + " report_end_date STRING,\n"
+                + " report_start_date STRING)\n"
+                + " CLUSTERED BY (reportid)\n"
+                + " into 100 buckets stored as orc tblproperties('transactional'='true')";
+            tableStmt.executeUpdate(sqlCreateTableDataciteeReports);
+        } catch (Exception e) {
+            logger.error("Failed to create tables: " + e);
+            throw new Exception("Failed to create tables: " + e.toString(), e);
+        }
+    }
+
+// runImpalaQuery();
+/*
+ PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
+
+ logger.info("Re-creating database and tables");
+
+ logger.info("Initializing the download logs module");
+ PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
+
+ if (ExecuteWorkflow.piwikEmptyDirs) {
+ logger.info("Recreating Piwik log directories");
+ piwikstatsdb.reCreateLogDirs();
+ }
+
+ // Downloading piwik logs (also managing directory creation)
+ if (ExecuteWorkflow.downloadPiwikLogs) {
+ logger.info("Downloading piwik logs");
+ piwd
+ .GetOpenAIRELogs(
+ ExecuteWorkflow.repoLogPath,
+ ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
+ }
+ logger.info("Downloaded piwik logs");
+
+ // Create DB tables, insert/update statistics
+ String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
+ piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
+
+ if (ExecuteWorkflow.processPiwikLogs) {
+ logger.info("Processing logs");
+ piwikstatsdb.processLogs();
+ }
+
+ logger.info("Creating LaReferencia tables");
+ LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
+ ExecuteWorkflow.lareferenciaAuthToken);
+
+ if (ExecuteWorkflow.laReferenciaEmptyDirs) {
+ logger.info("Recreating LaReferencia log directories");
+ lrf.reCreateLogDirs();
+ }
+
+ if (ExecuteWorkflow.downloadLaReferenciaLogs) {
+ logger.info("Downloading LaReferencia logs");
+ lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
+ logger.info("Downloaded LaReferencia logs");
+ }
+ LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
+
+ if (ExecuteWorkflow.processLaReferenciaLogs) {
+ logger.info("Processing LaReferencia logs");
+ lastats.processLogs();
+ logger.info("LaReferencia logs done");
+ }
+
+ IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
+ if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
+ logger.info("Creating Irus Stats tables");
+ irusstats.createTables();
+ logger.info("Created Irus Stats tables");
+
+ logger.info("Re-create log dirs");
+ irusstats.reCreateLogDirs();
+ logger.info("Re-created log dirs");
+ }
+
+ if (ExecuteWorkflow.irusDownloadReports) {
+ irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
+ }
+ if (ExecuteWorkflow.irusProcessStats) {
+ irusstats.processIrusStats();
+ logger.info("Irus done");
+ }
+
+ SarcStats sarcStats = new SarcStats();
+ if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
+ sarcStats.reCreateLogDirs();
+ }
+ if (ExecuteWorkflow.sarcDownloadReports) {
+ sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
+ }
+ if (ExecuteWorkflow.sarcProcessStats) {
+ sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
+ sarcStats.finalizeSarcStats();
+ }
+ logger.info("Sarc done");
+
+ // finalize usagestats
+ if (ExecuteWorkflow.finalizeStats) {
+ piwikstatsdb.finalizeStats();
+ logger.info("Finalized stats");
+ }
+
+ // Make the tables available to Impala
+ if (ExecuteWorkflow.finalTablesVisibleToImpala) {
+ logger.info("Making tables visible to Impala");
+ invalidateMetadata();
+ }
+
+ logger.info("End");
+ */
+}
+/*
+ private void invalidateMetadata() throws SQLException {
+ Statement stmt = null;
+
+ stmt = ConnectDB.getImpalaConnection().createStatement();
+
+ String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats";
+ stmt.executeUpdate(sql);
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ }
+ */
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json
new file mode 100644
index 000000000..f8d51a882
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json
@@ -0,0 +1,56 @@
+[
+ {
+ "paramName": "dbu",
+ "paramLongName": "dataciteBaseURL",
+ "paramDescription": "URL of Datacite Reports Endpoint",
+ "paramRequired": true
+ },
+ {
+ "paramName": "drp",
+ "paramLongName": "dataciteReportPath",
+ "paramDescription": "Path for Datacite Reports",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dbhu",
+ "paramLongName": "dbHiveUrl",
+ "paramDescription": "Hive server JDBC URL",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dbiu",
+ "paramLongName": "dbImpalaUrl",
+ "paramDescription": "Impala server JDBC URL",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dusdbs",
+ "paramLongName": "datasetUsageStatsDBSchema",
+ "paramDescription": "Name of the datasets usage stats database (schema)",
+ "paramRequired": true
+ },
+ {
+ "paramName": "sdbs",
+ "paramLongName": "statsDBSchema",
+ "paramDescription": "Name of the stats database (schema)",
+ "paramRequired": true
+ },
+ {
+ "paramName": "rdbt",
+ "paramLongName": "recreateDbAndTables",
+ "paramDescription": "Re-create database and initial tables?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pwed",
+ "paramLongName": "datasetsEmptyDirs",
+ "paramDescription": "Download the reports from Datacite? (despite its name, this flag does not empty any directory)",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ftvi",
+ "paramLongName": "finalTablesVisibleToImpala",
+ "paramDescription": "Make the dataset_usage_stats visible to Impala",
+ "paramRequired": true
+ }
+]
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml
new file mode 100644
index 000000000..b5c807378
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml
@@ -0,0 +1,38 @@
+
+
+ jobTracker
+ ${jobTracker}
+
+
+ nameNode
+ ${nameNode}
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hiveMetastoreUris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ hiveJdbcUrl
+ jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1
+
+
+ impalaJdbcUrl
+ jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl;
+
+
+ oozie.wf.workflow.notification.url
+ {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status
+
+
+ oozie.use.system.libpath
+ true
+
+
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml
new file mode 100644
index 000000000..3a81e497d
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml
@@ -0,0 +1,70 @@
+
+
+
+ hiveMetastoreUris
+ Hive server metastore URIs
+
+
+ hiveJdbcUrl
+ Hive server jdbc url
+
+
+ impalaJdbcUrl
+ Impala server jdbc url
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+
+
+ hive.metastore.uris
+ ${hiveMetastoreUris}
+
+
+ mapreduce.job.queuename
+ ${queueName}
+
+
+ oozie.launcher.mapred.job.queue.name
+ ${oozieLauncherQueueName}
+
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+ eu.dnetlib.oa.graph.datasetsusagestats.export.ExecuteWorkflow
+ --dataciteBaseURL
+ ${dataciteBaseURL}
+ --dataciteReportPath
+ ${dataciteReportPath}
+ --dbHiveUrl
+ ${hiveJdbcUrl}
+ --dbImpalaUrl
+ ${impalaJdbcUrl}
+ --datasetUsageStatsDBSchema
+ ${datasetUsageStatsDBSchema}
+ --statsDBSchema
+ ${statsDBSchema}
+ --recreateDbAndTables
+ ${recreateDbAndTables}
+ --datasetsEmptyDirs
+ ${datasetsEmptyDirs}
+ --finalTablesVisibleToImpala
+ ${finalTablesVisibleToImpala}
+
+
+
+
+
+
+
+
diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml
new file mode 100644
index 000000000..338b2a2c5
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml
@@ -0,0 +1,79 @@
+
+
+
+
+
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.1.7-SNAPSHOT
+
+ 4.0.0
+ dhp-usage-raw-data-update
+
+
+ UTF-8
+ UTF-8
+ 0.13.1-cdh5.2.1
+ 2.5.0-cdh5.2.1
+
+
+
+
+ org.apache.spark
+ spark-core_2.11
+ 2.2.0
+
+
+ org.apache.spark
+ spark-sql_2.11
+ 2.4.5
+
+
+ com.googlecode.json-simple
+ json-simple
+ 1.1.1
+
+
+ org.json
+ json
+ 20180130
+ jar
+
+
+ org.apache.hive
+ hive-jdbc
+ ${cdh.hive.version}
+
+
+ org.apache.hadoop
+ hadoop-common
+ ${cdh.hadoop.version}
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ c3p0
+ c3p0
+ 0.9.1.2
+ jar
+
+
+ dhp-usage-raw-data-update
+
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java
new file mode 100644
index 000000000..f76644c83
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java
@@ -0,0 +1,125 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Properties;
+
+import org.apache.log4j.Logger;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+import com.mchange.v2.c3p0.ComboPooledDataSource;
+
+/**
+ * Static holder for the Hive and Impala JDBC connections used by the usage raw data export.
+ * {@link #init()} must run first: it copies the JDBC URLs and schema names from
+ * {@link ExecuteWorkflow} and loads the Hive driver class. One c3p0 pool per backend is built on
+ * first use; the connection last checked out of it is cached in the public static fields.
+ * Nothing in this class is synchronised.
+ */
+public abstract class ConnectDB {
+
+    // Cached connections; the getters replace them once they are found closed.
+    public static Connection DB_HIVE_CONNECTION;
+    public static Connection DB_IMPALA_CONNECTION;
+
+    private static String dbHiveUrl;
+    private static String dbImpalaUrl;
+    private static String usageStatsDBSchema;
+    private static String statsDBSchema;
+    private final static Logger log = Logger.getLogger(ConnectDB.class);
+
+    // One pool per backend: building a new pool for every reconnect would leave the old ones open.
+    private static ComboPooledDataSource hivePool;
+    private static ComboPooledDataSource impalaPool;
+
+    /**
+     * Copies the connection settings from {@link ExecuteWorkflow} and loads the Hive JDBC driver.
+     *
+     * @throws ClassNotFoundException if {@code org.apache.hive.jdbc.HiveDriver} cannot be loaded
+     */
+    static void init() throws ClassNotFoundException {
+        dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
+        dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
+        usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema;
+        statsDBSchema = ExecuteWorkflow.statsDBSchema;
+
+        Class.forName("org.apache.hive.jdbc.HiveDriver");
+    }
+
+    /** Returns the cached Hive connection, checking out a new one if none is cached or it is closed. */
+    public static Connection getHiveConnection() throws SQLException {
+        if (DB_HIVE_CONNECTION == null || DB_HIVE_CONNECTION.isClosed()) {
+            DB_HIVE_CONNECTION = connectHive();
+        }
+        return DB_HIVE_CONNECTION;
+    }
+
+    /** Returns the cached Impala connection, checking out a new one if none is cached or it is closed. */
+    public static Connection getImpalaConnection() throws SQLException {
+        if (DB_IMPALA_CONNECTION == null || DB_IMPALA_CONNECTION.isClosed()) {
+            DB_IMPALA_CONNECTION = connectImpala();
+        }
+        return DB_IMPALA_CONNECTION;
+    }
+
+    /** @return the usage stats schema name copied from {@link ExecuteWorkflow} */
+    public static String getUsageStatsDBSchema() {
+        return ConnectDB.usageStatsDBSchema;
+    }
+
+    /** @return the stats schema name copied from {@link ExecuteWorkflow} */
+    public static String getStatsDBSchema() {
+        return ConnectDB.statsDBSchema;
+    }
+
+    /** Checks a connection out of the Hive pool, building the pool on first use. */
+    private static Connection connectHive() throws SQLException {
+        if (hivePool == null) {
+            hivePool = newPool(dbHiveUrl);
+        }
+        return hivePool.getConnection();
+    }
+
+    /** Checks a connection out of the Impala pool, building the pool on first use. */
+    private static Connection connectImpala() throws SQLException {
+        if (impalaPool == null) {
+            impalaPool = newPool(dbImpalaUrl);
+        }
+        return impalaPool.getConnection();
+    }
+
+    /** Builds a c3p0 pool for {@code jdbcUrl} with the settings shared by both backends. */
+    private static ComboPooledDataSource newPool(String jdbcUrl) {
+        ComboPooledDataSource cpds = new ComboPooledDataSource();
+        cpds.setJdbcUrl(jdbcUrl);
+        cpds.setAcquireIncrement(1);
+        cpds.setMaxPoolSize(100);
+        cpds.setMinPoolSize(1);
+        cpds.setInitialPoolSize(1);
+        cpds.setMaxIdleTime(300);
+        cpds.setMaxConnectionAge(36000);
+
+        cpds.setAcquireRetryAttempts(5);
+        cpds.setAcquireRetryDelay(2000);
+        cpds.setBreakAfterAcquireFailure(false);
+
+        cpds.setCheckoutTimeout(0);
+        cpds.setPreferredTestQuery("SELECT 1");
+        cpds.setIdleConnectionTestPeriod(60);
+        return cpds;
+    }
+
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java
new file mode 100644
index 000000000..774dcf0b7
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java
@@ -0,0 +1,202 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+import java.util.Date;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.BasicConfigurator;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+/**
+ * Entry point of the usage raw-data workflow: copies the workflow arguments into the static
+ * configuration fields below and then runs {@link UsageStatsExporter#export()}.
+ *
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class ExecuteWorkflow {
+
+    // Endpoints, credentials and storage paths
+    static String matomoAuthToken;
+    static String matomoBaseURL;
+    static String repoLogPath;
+    static String portalLogPath;
+    static String portalMatomoID;
+    static String irusUKBaseURL;
+    static String irusUKReportPath;
+    static String sarcsReportPathArray;
+    static String sarcsReportPathNonArray;
+    static String lareferenciaLogPath;
+    static String lareferenciaBaseURL;
+    static String lareferenciaAuthToken;
+
+    // Database settings
+    static String dbHiveUrl;
+    static String dbImpalaUrl;
+    static String usageStatsDBSchema;
+    static String statsDBSchema;
+    static boolean recreateDbAndTables;
+
+    // Piwik/Matomo steps
+    static boolean piwikEmptyDirs;
+    static boolean downloadPiwikLogs;
+    static boolean processPiwikLogs;
+
+    // Log period and download limits
+    static Calendar startingLogPeriod;
+    static Calendar endingLogPeriod;
+    static int numberOfPiwikIdsToDownload;
+    static int numberOfSiteIdsToDownload;
+
+    // LaReferencia steps
+    static boolean laReferenciaEmptyDirs;
+    static boolean downloadLaReferenciaLogs;
+    static boolean processLaReferenciaLogs;
+
+    // IRUS-UK steps
+    static boolean irusCreateTablesEmptyDirs;
+    static boolean irusDownloadReports;
+    static boolean irusProcessStats;
+    static int irusNumberOfOpendoarsToDownload;
+
+    // SARC steps
+    static boolean sarcCreateTablesEmptyDirs;
+    static boolean sarcDownloadReports;
+    static boolean sarcProcessStats;
+    static int sarcNumberOfIssnToDownload;
+
+    // Finalisation steps
+    static boolean finalizeStats;
+    static boolean finalTablesVisibleToImpala;
+
+    static int numberOfDownloadThreads;
+
+    /**
+     * Parses the workflow arguments, stores them in the static fields of this class and starts
+     * the export.
+     *
+     * @param args command-line arguments understood by the bundled parameter description
+     * @throws Exception if the arguments cannot be parsed or the export fails
+     */
+    public static void main(String args[]) throws Exception {
+
+        // Sending the logs to the console
+        BasicConfigurator.configure();
+
+        String parameterSpec = IOUtils
+            .toString(
+                UsageStatsExporter.class
+                    .getResourceAsStream(
+                        "/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json"));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parameterSpec);
+        parser.parseArgument(args);
+
+        // Endpoints, credentials and paths
+        matomoAuthToken = parser.get("matomoAuthToken");
+        matomoBaseURL = parser.get("matomoBaseURL");
+        repoLogPath = parser.get("repoLogPath");
+        portalLogPath = parser.get("portalLogPath");
+        portalMatomoID = parser.get("portalMatomoID");
+        irusUKBaseURL = parser.get("irusUKBaseURL");
+        irusUKReportPath = parser.get("irusUKReportPath");
+        sarcsReportPathArray = parser.get("sarcsReportPathArray");
+        sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
+        lareferenciaLogPath = parser.get("lareferenciaLogPath");
+        lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
+        lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
+
+        // Database settings
+        dbHiveUrl = parser.get("dbHiveUrl");
+        dbImpalaUrl = parser.get("dbImpalaUrl");
+        usageStatsDBSchema = parser.get("usageStatsDBSchema");
+        statsDBSchema = parser.get("statsDBSchema");
+
+        recreateDbAndTables = isTrue(parser, "recreateDbAndTables");
+
+        piwikEmptyDirs = isTrue(parser, "piwikEmptyDirs");
+        downloadPiwikLogs = isTrue(parser, "downloadPiwikLogs");
+        // processPiwikLogs is not read from the arguments and keeps its default (false).
+
+        startingLogPeriod = monthOf(parser.get("startingLogPeriod"));
+        endingLogPeriod = monthOf(parser.get("endingLogPeriod"));
+
+        numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
+        numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
+
+        laReferenciaEmptyDirs = isTrue(parser, "laReferenciaEmptyDirs");
+        downloadLaReferenciaLogs = isTrue(parser, "downloadLaReferenciaLogs");
+        // processLaReferenciaLogs is not read from the arguments and keeps its default (false).
+
+        irusCreateTablesEmptyDirs = isTrue(parser, "irusCreateTablesEmptyDirs");
+        irusDownloadReports = isTrue(parser, "irusDownloadReports");
+        // irusProcessStats and irusNumberOfOpendoarsToDownload are not read from the
+        // arguments and keep their defaults (false / 0).
+
+        sarcCreateTablesEmptyDirs = isTrue(parser, "sarcCreateTablesEmptyDirs");
+        sarcDownloadReports = isTrue(parser, "sarcDownloadReports");
+        // sarcProcessStats and sarcNumberOfIssnToDownload are not read from the arguments
+        // and keep their defaults (false / 0).
+
+        // finalizeStats is not read from the arguments; finalTablesVisibleToImpala is forced off.
+        finalTablesVisibleToImpala = false;
+
+        numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
+
+        UsageStatsExporter usagestatsExport = new UsageStatsExporter();
+        usagestatsExport.export();
+    }
+
+    /**
+     * Reads a boolean workflow argument; only the value "true" (in any letter case) enables it.
+     */
+    private static boolean isTrue(ArgumentApplicationParser parser, String name) {
+        return parser.get(name).toLowerCase().equals("true");
+    }
+
+    /**
+     * Converts a {@code MM/yyyy} argument into a {@link Calendar} set to the parsed date.
+     */
+    private static Calendar monthOf(String monthAndYear) throws Exception {
+        Date parsed = new SimpleDateFormat("MM/yyyy").parse(monthAndYear);
+        Calendar calendar = Calendar.getInstance();
+        calendar.setTime(parsed);
+        return calendar;
+    }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java
new file mode 100644
index 000000000..090f76ff5
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java
@@ -0,0 +1,419 @@
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.io.*;
+import java.net.URL;
+import java.net.URLConnection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class IrusStats {
+
+ private String irusUKURL;
+
+ private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
+
+ /**
+  * Creates the IRUS-UK handler; no tables are created here.
+  *
+  * @param irusUKURL base URL that the report request paths are appended to
+  * @throws Exception declared for callers; nothing in this constructor currently throws
+  */
+ public IrusStats(String irusUKURL) throws Exception {
+     // Temporary tables are deliberately not created here.
+     this.irusUKURL = irusUKURL;
+ }
+
+ /**
+  * Empties the IRUS-UK report directory by deleting it recursively and creating it again.
+  *
+  * @throws Exception if the file system cannot be reached or modified
+  */
+ public void reCreateLogDirs() throws Exception {
+     FileSystem fileSystem = FileSystem.get(new Configuration());
+     Path reportDir = new Path(ExecuteWorkflow.irusUKReportPath);
+
+     logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
+     fileSystem.delete(reportDir, true);
+
+     logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
+     fileSystem.mkdirs(reportDir);
+ }
+
+ /**
+  * Creates the transactional {@code sushilog} table in the usage-stats schema if it does not
+  * exist yet, then closes the Hive connection.
+  * <p>
+  * The statement is now closed even when the DDL fails.
+  * TODO: the rule/index based duplicate suppression of the earlier implementation is not applied here.
+  *
+  * @throws Exception wrapping any failure raised while creating the table
+  */
+ public void createTables() throws Exception {
+     try {
+         logger.info("Creating sushilog");
+         String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+             + ".sushilog(source STRING, "
+             + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
+             + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
+         try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
+             stmt.executeUpdate(sqlCreateTableSushiLog);
+         }
+         logger.info("Created sushilog");
+
+         ConnectDB.getHiveConnection().close();
+         logger.info("Sushi Tables Created");
+     } catch (Exception e) {
+         // Pass the exception itself so the stack trace reaches the log.
+         logger.error("Failed to create tables", e);
+         throw new Exception("Failed to create tables: " + e.toString(), e);
+     }
+ }
+
+// // The following may not be needed - It will be created when JSON tables are created
+// private void createTmpTables() throws Exception {
+// try {
+//
+// Statement stmt = ConnectDB.getConnection().createStatement();
+// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
+// stmt.executeUpdate(sqlCreateTableSushiLog);
+//
+// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;");
+// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;";
+// // stmt.executeUpdate(sqlCopyPublicSushiLog);
+// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+// + " ON INSERT TO sushilogtmp "
+// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository,"
+// + "sushilogtmp.rid, sushilogtmp.date "
+// + "FROM sushilogtmp "
+// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
+// stmt.executeUpdate(sqlcreateRuleSushiLog);
+//
+// stmt.close();
+// ConnectDB.getConnection().close();
+// log.info("Sushi Tmp Tables Created");
+// } catch (Exception e) {
+// log.error("Failed to create tables: " + e);
+// throw new Exception("Failed to create tables: " + e.toString(), e);
+// }
+// }
+ /**
+  * Loads the downloaded IRUS-UK JSON reports into Hive.
+  * <p>
+  * Steps, in order: register the JSON SerDe jar; expose the report directory as the external
+  * table {@code irus_sushilogtmp_json}; flatten it into {@code irus_sushilogtmp}; append the
+  * {@code ft_total} rows joined with the stats schema to {@code downloads_stats}; append all
+  * flattened rows to {@code sushilog}; close the Hive connection.
+  * <p>
+  * The statement is now closed even when one of the steps fails.
+  * <p>
+  * NOTE(review): the first DROP targets {@code sushilogtmp_json}, while the table created right
+  * after it is {@code irus_sushilogtmp_json} - confirm which one is meant.
+  * NOTE(review): {@code sushilog} is declared here with a {@code repository_id} column, while
+  * {@link #createTables()} declares {@code repository}; both use IF NOT EXISTS.
+  *
+  * @throws Exception if any Hive statement fails
+  */
+ public void processIrusStats() throws Exception {
+     try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
+         ConnectDB.getHiveConnection().setAutoCommit(false);
+
+         logger.info("Adding JSON Serde jar");
+         stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+         logger.info("Added JSON Serde jar");
+
+         logger.info("Dropping sushilogtmp_json table");
+         String dropSushilogtmpJson = "DROP TABLE IF EXISTS "
+             + ConnectDB.getUsageStatsDBSchema()
+             + ".sushilogtmp_json";
+         stmt.executeUpdate(dropSushilogtmpJson);
+         logger.info("Dropped sushilogtmp_json table");
+
+         // External table over the raw report files: one JSON report item per line.
+         logger.info("Creating irus_sushilogtmp_json table");
+         String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+             + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n"
+             + " `ItemIdentifier` ARRAY<\n"
+             + " struct<\n"
+             + " Type: STRING,\n"
+             + " Value: STRING\n"
+             + " >\n"
+             + " >,\n"
+             + " `ItemPerformance` ARRAY<\n"
+             + " struct<\n"
+             + " `Period`: struct<\n"
+             + " `Begin`: STRING,\n"
+             + " `End`: STRING\n"
+             + " >,\n"
+             + " `Instance`: struct<\n"
+             + " `Count`: STRING,\n"
+             + " `MetricType`: STRING\n"
+             + " >\n"
+             + " >\n"
+             + " >\n"
+             + ")\n"
+             + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+             + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n"
+             + "TBLPROPERTIES (\"transactional\"=\"false\")";
+         stmt.executeUpdate(createSushilogtmpJson);
+         logger.info("Created irus_sushilogtmp_json table");
+
+         logger.info("Dropping irus_sushilogtmp table");
+         String dropSushilogtmp = "DROP TABLE IF EXISTS "
+             + ConnectDB.getUsageStatsDBSchema()
+             + ".irus_sushilogtmp";
+         stmt.executeUpdate(dropSushilogtmp);
+         logger.info("Dropped irus_sushilogtmp table");
+
+         logger.info("Creating irus_sushilogtmp table");
+         String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
+             + ".irus_sushilogtmp(source STRING, repository STRING, "
+             + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
+             + "tblproperties('transactional'='true')";
+         stmt.executeUpdate(createSushilogtmp);
+         logger.info("Created irus_sushilogtmp table");
+
+         // The repository id is recovered from the file name (IrusIRReport_<opendoar>_...);
+         // only identifiers of type OAI are kept.
+         logger.info("Inserting to irus_sushilogtmp table");
+         String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp "
+             + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), "
+             + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, "
+             + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` "
+             + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json "
+             + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
+             + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf "
+             + "WHERE `ItemIdent`.`Type`= 'OAI'";
+         stmt.executeUpdate(insertSushilogtmp);
+         logger.info("Inserted to irus_sushilogtmp table");
+
+         logger.info("Creating downloads_stats table");
+         String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+             + ".downloads_stats "
+             + "(`source` string, "
+             + "`repository_id` string, "
+             + "`result_id` string, "
+             + "`date` string, "
+             + "`count` bigint, "
+             + "`openaire` bigint)";
+         stmt.executeUpdate(createDownloadsStats);
+         logger.info("Created downloads_stats table");
+
+         logger.info("Inserting into downloads_stats");
+         String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+             + "SELECT s.source, d.id AS repository_id, "
+             + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
+             + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, "
+             + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+             + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+             + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
+         stmt.executeUpdate(insertDStats);
+         logger.info("Inserted into downloads_stats");
+
+         logger.info("Creating sushilog table");
+         String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+             + ".sushilog "
+             + "(`source` string, "
+             + "`repository_id` string, "
+             + "`rid` string, "
+             + "`date` string, "
+             + "`metric_type` string, "
+             + "`count` int)";
+         stmt.executeUpdate(createSushilog);
+         logger.info("Created sushilog table");
+
+         logger.info("Inserting to sushilog table");
+         String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM "
+             + ConnectDB.getUsageStatsDBSchema()
+             + ".irus_sushilogtmp";
+         stmt.executeUpdate(insertToShushilog);
+         logger.info("Inserted to sushilog table");
+     }
+
+     ConnectDB.getHiveConnection().close();
+ }
+
+ /**
+  * Downloads the IRUS-UK repository report (RR1) for the configured log period, collects the
+  * OpenDOAR identifier of every listed repository and fetches the item report of each one.
+  * <p>
+  * When {@code ExecuteWorkflow.irusNumberOfOpendoarsToDownload} is positive and not larger than
+  * the number of repositories found, only that many repositories are processed.
+  *
+  * @param irusUKReportPath directory the per-repository item reports are written to
+  * @throws Exception if the repository report cannot be downloaded or parsed, or an item
+  *                   report fails
+  */
+ public void getIrusRRReport(String irusUKReportPath) throws Exception {
+     // "yyyy" is the calendar year. The previous "YYYY" is the week-based year, which yields
+     // the wrong year for dates close to a year boundary (e.g. 31 December).
+     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM");
+     // Setting the starting period
+     Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+     logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime()));
+
+     // Setting the ending period (last day of the month)
+     Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+     end.add(Calendar.MONTH, +1);
+     end.add(Calendar.DAY_OF_MONTH, -1);
+     logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime()));
+
+     String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+         + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime())
+         + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback=";
+
+     logger.info("(getIrusRRReport) Getting report: " + reportUrl);
+
+     String text = getJson(reportUrl, "", "");
+     if (text == null) {
+         // getJson reports download failures as null; fail with a clear message instead of
+         // letting the JSON parser hit a NullPointerException.
+         throw new IOException("Failed to download IRUS-UK repository report: " + reportUrl);
+     }
+
+     List<String> opendoarsToVisit = new ArrayList<>();
+     JSONParser parser = new JSONParser();
+     JSONObject jsonObject = (JSONObject) parser.parse(text);
+     jsonObject = (JSONObject) jsonObject.get("ReportResponse");
+     jsonObject = (JSONObject) jsonObject.get("Report");
+     jsonObject = (JSONObject) jsonObject.get("Report");
+     jsonObject = (JSONObject) jsonObject.get("Customer");
+     JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
+     for (Object aJsonArray : jsonArray) {
+         JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+         JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
+         // Only the first OpenDOAR identifier of each report item is used.
+         for (Object identifier : itemIdentifier) {
+             JSONObject opendoar = (JSONObject) identifier;
+             if (opendoar.get("Type").toString().equals("OpenDOAR")) {
+                 opendoarsToVisit.add(opendoar.get("Value").toString());
+                 break;
+             }
+         }
+     }
+
+     logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit);
+
+     if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0
+         && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) {
+         logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
+         opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
+     }
+
+     logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit);
+
+     for (String opendoar : opendoarsToVisit) {
+         logger.info("Now working on openDoar: " + opendoar);
+         this.getIrusIRReport(opendoar, irusUKReportPath);
+     }
+
+     logger.info("(getIrusRRReport) Finished with report: " + reportUrl);
+ }
+
+ /**
+  * Downloads the monthly IRUS-UK item reports (IR1) of one repository and stores each month
+  * as a file with one JSON report item per line.
+  * <p>
+  * The latest date already present in {@code sushilog} for the repository is looked up first;
+  * when one is found, nothing is downloaded for this repository.
+  * <p>
+  * Changes compared with the previous version: the month pattern uses the calendar year
+  * ({@code yyyy}) instead of the week-based year; the JDBC statement, result set and output
+  * stream are always closed; the output file is only created once report items are available;
+  * lines are written as UTF-8 and terminated by a single {@code '\n'} byte.
+  *
+  * @param opendoar         OpenDOAR identifier of the repository
+  * @param irusUKReportPath directory the report files are written to
+  * @throws Exception if the database lookup, the JSON parsing or the file output fails
+  */
+ private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception {
+
+     logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar);
+
+     ConnectDB.getHiveConnection().setAutoCommit(false);
+
+     // "yyyy" (calendar year), not "YYYY" (week-based year).
+     SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM");
+
+     // Setting the starting period
+     Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+     logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
+
+     // Setting the ending period (last day of the month)
+     Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+     end.add(Calendar.MONTH, +1);
+     end.add(Calendar.DAY_OF_MONTH, -1);
+     logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
+
+     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+     Date dateMax = null;
+     try (PreparedStatement st = ConnectDB
+         .getHiveConnection()
+         .prepareStatement(
+             "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?")) {
+         st.setString(1, "opendoar____::" + opendoar);
+         try (ResultSet rs_date = st.executeQuery()) {
+             while (rs_date.next()) {
+                 String latest = rs_date.getString(1);
+                 if (latest != null && !latest.equals("null") && !latest.equals("")) {
+                     dateMax = sdf.parse(latest);
+                     start.setTime(dateMax);
+                 }
+             }
+         }
+     }
+
+     // NOTE(review): start is set to dateMax above, so this condition holds whenever any
+     // date was found - confirm that skipping the whole repository is intended.
+     if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) {
+         logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar);
+     } else {
+         while (start.before(end)) {
+             String period = simpleDateFormat.format(start.getTime());
+             logger.info("date: " + period);
+             String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+                 + period + "&EndDate=" + period
+                 + "&RepositoryIdentifier=opendoar%3A" + opendoar
+                 + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
+             start.add(Calendar.MONTH, 1);
+
+             logger.info("Downloading file: " + reportUrl);
+             String text = getJson(reportUrl, "", "");
+             if (text == null) {
+                 continue;
+             }
+
+             JSONParser parser = new JSONParser();
+             JSONObject jsonObject = (JSONObject) parser.parse(text);
+             jsonObject = (JSONObject) jsonObject.get("ReportResponse");
+             jsonObject = (JSONObject) jsonObject.get("Report");
+             jsonObject = (JSONObject) jsonObject.get("Report");
+             jsonObject = (JSONObject) jsonObject.get("Customer");
+             JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
+             if (jsonArray == null) {
+                 continue;
+             }
+
+             // NOTE(review): start has already been advanced, so the file name carries the
+             // month after the one requested; kept unchanged so file names stay stable.
+             String filePath = irusUKReportPath + "/" + "IrusIRReport_"
+                 + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json";
+             logger.info("Storing to file: " + filePath);
+
+             FileSystem fs = FileSystem.get(new Configuration());
+             try (FSDataOutputStream fin = fs.create(new Path(filePath), true)) {
+                 for (Object aJsonArray : jsonArray) {
+                     JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+                     fin.write(jsonObjectRow.toJSONString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
+                     // write(int) emits one byte; writeChar would emit two (0x00 0x0A).
+                     fin.write('\n');
+                 }
+             }
+         }
+
+     }
+
+     logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar);
+ }
+
+ /**
+  * Fetches a URL and returns the response body with all line breaks removed, echoing the URL
+  * and the response to standard output.
+  *
+  * @param url address to read from
+  * @return the concatenated response lines
+  * @throws Exception wrapping any failure while connecting or reading
+  */
+ private String getJson(String url) throws Exception {
+     try {
+         System.out.println("===> Connecting to: " + url);
+         URL target = new URL(url);
+         System.out.println("Connection url -----> " + url);
+         URLConnection connection = target.openConnection();
+
+         StringBuilder body = new StringBuilder();
+         try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+             // Lines are joined without a separator.
+             for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+                 body.append(line);
+             }
+         }
+
+         String json = body.toString();
+         System.out.println("response ====> " + json);
+         return json;
+     } catch (Exception e) {
+         logger.error("Failed to get URL: " + e);
+         System.out.println("Failed to get URL: " + e);
+         throw new Exception("Failed to get URL: " + e.toString(), e);
+     }
+ }
+
+ /**
+  * Fetches a URL and returns the response body, each line terminated by {@code '\n'}.
+  * <p>
+  * The response is decoded as UTF-8 rather than with the platform default charset, so
+  * non-ASCII characters in the JSON are read the same way on every host.
+  *
+  * @param url      address to read from
+  * @param username currently unused (basic authentication is disabled)
+  * @param password currently unused (basic authentication is disabled)
+  * @return the response body, or {@code null} when the request fails (the failure is logged)
+  * @throws Exception declared for callers; failures are reported through the null result
+  */
+ private String getJson(String url, String username, String password) throws Exception {
+     try {
+         URL website = new URL(url);
+         URLConnection connection = website.openConnection();
+         StringBuilder response = new StringBuilder();
+         try (BufferedReader in = new BufferedReader(
+             new InputStreamReader(connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
+             String inputLine;
+             while ((inputLine = in.readLine()) != null) {
+                 response.append(inputLine);
+                 response.append("\n");
+             }
+         }
+         return response.toString();
+     } catch (Exception e) {
+         logger.error("Failed to get URL", e);
+         return null;
+     }
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java
new file mode 100644
index 000000000..88550579b
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java
@@ -0,0 +1,265 @@
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.io.*;
+import java.net.URL;
+import java.net.URLConnection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class LaReferenciaDownloadLogs {
+
+ private final String piwikUrl;
+ private Date startDate;
+ private final String tokenAuth;
+
+ /*
+ * The Piwik's API method
+ */
+ private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+ private final String format = "&format=json";
+ private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess";
+
+ private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class);
+
+ /**
+  * Stores the Matomo/Piwik endpoint and token, then makes sure the LaReferencia log table
+  * exists.
+  *
+  * @param piwikUrl  base URL of the Matomo/Piwik installation
+  * @param tokenAuth authentication token appended to every API request
+  * @throws Exception if the table cannot be created
+  */
+ public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception {
+     this.tokenAuth = tokenAuth;
+     this.piwikUrl = piwikUrl;
+     // Only the permanent table is created; no temporary tables.
+     this.createTables();
+ }
+
+ /**
+  * Empties the LaReferencia log directory by deleting it recursively and creating it again.
+  *
+  * @throws IllegalArgumentException if the configured path is rejected by the file system
+  * @throws IOException              if the directory cannot be deleted or created
+  */
+ public void reCreateLogDirs() throws IllegalArgumentException, IOException {
+     FileSystem fileSystem = FileSystem.get(new Configuration());
+     Path logDir = new Path(ExecuteWorkflow.lareferenciaLogPath);
+
+     logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+     fileSystem.delete(logDir, true);
+
+     logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+     fileSystem.mkdirs(logDir);
+ }
+
+ /**
+  * Creates the transactional {@code lareferencialog} table in the usage-stats schema if it
+  * does not exist yet, then closes the Hive connection.
+  * <p>
+  * The statement is now closed even when the DDL fails.
+  * TODO: the rule/index based duplicate suppression of the earlier implementation is not applied here.
+  *
+  * @throws Exception wrapping any failure raised while creating the table
+  */
+ private void createTables() throws Exception {
+     try {
+         try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
+             logger.info("Creating LaReferencia tables");
+             String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
+                 + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
+                 + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+                 + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+                 + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+                 + "stored as orc tblproperties('transactional'='true')";
+             stmt.executeUpdate(sqlCreateTableLareferenciaLog);
+             logger.info("Created LaReferencia tables");
+         }
+
+         ConnectDB.getHiveConnection().close();
+         logger.info("Lareferencia Tables Created");
+
+     } catch (Exception e) {
+         // Pass the exception itself so the stack trace reaches the log.
+         logger.error("Failed to create tables", e);
+         throw new Exception("Failed to create tables: " + e.toString(), e);
+     }
+ }
+
+// private void createTmpTables() throws Exception {
+//
+// try {
+// Statement stmt = ConnectDB.getConnection().createStatement();
+// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
+// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+// + " ON INSERT TO lareferencialogtmp "
+// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit,"
+// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id "
+// + "FROM lareferencialogtmp "
+// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
+// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog);
+// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog);
+//
+// stmt.close();
+// log.info("Lareferencia Tmp Tables Created");
+//
+// } catch (Exception e) {
+// log.error("Failed to create tmptables: " + e);
+// throw new Exception("Failed to create tmp tables: " + e.toString(), e);
+// // System.exit(0);
+// }
+// }
+ /**
+  * Returns the configured Piwik base URL followed by a slash.
+  */
+ private String getPiwikLogUrl() {
+     StringBuilder base = new StringBuilder();
+     base.append(piwikUrl).append("/");
+     return base.toString();
+ }
+
+ /**
+  * Fetches a URL and returns the response body with all line breaks removed.
+  * <p>
+  * The response is decoded as UTF-8 rather than with the platform default charset, so
+  * non-ASCII characters in the JSON are read the same way on every host.
+  *
+  * @param url address to read from
+  * @return the concatenated response lines
+  * @throws Exception wrapping any failure while connecting or reading
+  */
+ private String getJson(String url) throws Exception {
+     try {
+         URL website = new URL(url);
+         URLConnection connection = website.openConnection();
+
+         StringBuilder response = new StringBuilder();
+         try (BufferedReader in = new BufferedReader(
+             new InputStreamReader(connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
+             String inputLine;
+             // Lines are joined without a separator.
+             while ((inputLine = in.readLine()) != null) {
+                 response.append(inputLine);
+             }
+         }
+
+         return response.toString();
+     } catch (Exception e) {
+         logger.error("Failed to get URL", e);
+         throw new Exception("Failed to get URL: " + e.toString(), e);
+     }
+ }
+
+ /**
+  * Lists every Matomo site visible to the configured token and downloads the logs of each
+  * one through {@link #GetLaReFerenciaLogs(String, int)}.
+  * <p>
+  * When {@code ExecuteWorkflow.numberOfPiwikIdsToDownload} is positive and not larger than the
+  * number of sites found, only that many sites are processed.
+  *
+  * @param repoLogsPath directory the log files are written to
+  * @throws Exception if the site list cannot be fetched or parsed, or a download fails
+  */
+ public void GetLaReferenciaRepos(String repoLogsPath) throws Exception {
+
+     String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
+
+     // Getting all the siteIds in a list for logging reasons & limiting the list
+     // to the max number of siteIds
+     List<Integer> siteIdsToVisit = new ArrayList<>();
+     String content = getJson(baseApiUrl);
+     JSONParser parser = new JSONParser();
+     JSONArray jsonArray = (JSONArray) parser.parse(content);
+     for (Object aJsonArray : jsonArray) {
+         JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+         siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
+     }
+     logger.info("Found the following siteIds for download: " + siteIdsToVisit);
+
+     if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+         && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) {
+         logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+         siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+     }
+
+     logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit);
+
+     for (int siteId : siteIdsToVisit) {
+         logger.info("Now working on LaReferencia MatomoId: " + siteId);
+         this.GetLaReFerenciaLogs(repoLogsPath, siteId);
+     }
+ }
+
+ /**
+  * Downloads the daily visit logs of one Matomo site for the configured log period and stores
+  * each day as a file with one JSON visit per line.
+  * <p>
+  * The latest timestamp already present in {@code lareferencialog} for the site is looked up
+  * first; the download then starts at that day, and days not later than it are skipped. Each
+  * day is fetched in pages of 1000 visits until an empty page is returned.
+  * <p>
+  * Changes compared with the previous version: the JDBC statement, result set and output stream
+  * are always closed; lines are written as UTF-8 and terminated by a single {@code '\n'} byte.
+  * <p>
+  * NOTE(review): {@code end} is midnight of the last day of the ending month and the loop bound
+  * is exclusive, so that last day is never downloaded - confirm whether this is intended.
+  *
+  * @param repoLogsPath          directory the log files are written to
+  * @param laReferencialMatomoID Matomo site id of the repository
+  * @throws Exception if the database lookup, a download, the JSON parsing or the file output
+  *                   fails
+  */
+ public void GetLaReFerenciaLogs(String repoLogsPath,
+     int laReferencialMatomoID) throws Exception {
+
+     logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);
+
+     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+     // Setting the starting period
+     Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+     logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+
+     // Setting the ending period (last day of the month)
+     Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+     end.add(Calendar.MONTH, +1);
+     end.add(Calendar.DAY_OF_MONTH, -1);
+     logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+
+     Date dateMax = null;
+     try (PreparedStatement st = ConnectDB
+         .getHiveConnection()
+         .prepareStatement(
+             "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+                 + ".lareferencialog WHERE matomoid=?")) {
+         st.setInt(1, laReferencialMatomoID);
+         try (ResultSet rs_date = st.executeQuery()) {
+             while (rs_date.next()) {
+                 String latest = rs_date.getString(1);
+                 if (latest != null && !latest.equals("null") && !latest.equals("")) {
+                     dateMax = sdf.parse(latest);
+                     start.setTime(dateMax);
+                 }
+             }
+         }
+     }
+
+     for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+         Date date = currDay.getTime();
+         if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+             logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID);
+             continue;
+         }
+
+         logger
+             .info(
+                 "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "
+                     + sdf.format(date));
+
+         String period = "&period=day&date=" + sdf.format(date);
+         String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format
+             + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+
+         FileSystem fs = FileSystem.get(new Configuration());
+         Path outFile = new Path(
+             repoLogsPath + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json");
+         JSONParser parser = new JSONParser();
+
+         // The file is created before the first page is requested, as before, so a day
+         // without visits still leaves an empty file.
+         try (FSDataOutputStream fin = fs.create(outFile, true)) {
+             int i = 0;
+             while (true) {
+                 String apiUrl = baseApiUrl;
+                 if (i > 0) {
+                     apiUrl += "&filter_offset=" + (i * 1000);
+                 }
+
+                 String content = getJson(apiUrl);
+                 if (content.length() == 0 || content.equals("[]")) {
+                     break;
+                 }
+
+                 JSONArray jsonArray = (JSONArray) parser.parse(content);
+                 for (Object aJsonArray : jsonArray) {
+                     JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+                     fin.write(jsonObjectRaw.toJSONString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
+                     // write(int) emits one byte; writeChar would emit two (0x00 0x0A).
+                     fin.write('\n');
+                 }
+
+                 logger
+                     .info(
+                         "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID
+                             + " and for "
+                             + sdf.format(date));
+                 i++;
+             }
+         }
+     }
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java
new file mode 100644
index 000000000..ef7636099
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java
@@ -0,0 +1,436 @@
+
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.io.*;
+import java.net.URLDecoder;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Timestamp;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class LaReferenciaStats {
+
+ private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class);
+
+ private String logRepoPath;
+
+ private Statement stmt = null;
+
+ private String CounterRobotsURL;
+ private ArrayList robotsList;
+
+ /**
+  * Builds the LaReferencia log processor and makes sure its log table exists.
+  *
+  * @param logRepoPath value kept in the {@code logRepoPath} field
+  * @throws Exception if creating the table fails
+  */
+ public LaReferenciaStats(String logRepoPath) throws Exception {
+     this.logRepoPath = logRepoPath;
+     createTables();
+ }
+
+ /*
+ * private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) {
+ * log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } }
+ */
+ /**
+  * Creates the transactional ORC table {@code lareferencialog} in the usage stats schema when it does not
+  * exist yet, and afterwards closes the Hive connection returned by {@code ConnectDB.getHiveConnection()}.
+  *
+  * @throws Exception wrapping whatever failure occurred while creating the table
+  */
+ private void createTables() throws Exception {
+     try {
+         // try-with-resources: the statement is released even when the DDL fails
+         try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
+             logger.info("Creating LaReferencia tables");
+             String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " +
+                 ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " +
+                 "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " +
+                 "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+                 "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " +
+                 "stored as orc tblproperties('transactional'='true')";
+             stmt.executeUpdate(sqlCreateTableLareferenciaLog);
+             logger.info("Created LaReferencia tables");
+         }
+
+         ConnectDB.getHiveConnection().close();
+         logger.info("Lareferencia Tables Created");
+     } catch (Exception e) {
+         // pass the throwable as well so the stack trace reaches the log
+         logger.error("Failed to create tables: " + e, e);
+         throw new Exception("Failed to create tables: " + e.toString(), e);
+     }
+ }
+
+// private void createTmpTables() throws Exception {
+//
+// try {
+// Statement stmt = ConnectDB.getConnection().createStatement();
+// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
+// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+// + " ON INSERT TO lareferencialogtmp "
+// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit,"
+// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id "
+// + "FROM lareferencialogtmp "
+// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
+// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog);
+// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog);
+//
+// stmt.close();
+// log.info("Lareferencia Tmp Tables Created");
+//
+// } catch (Exception e) {
+// log.error("Failed to create tmptables: " + e);
+// throw new Exception("Failed to create tmp tables: " + e.toString(), e);
+// // System.exit(0);
+// }
+// }
+
+ public void processLogs() throws Exception {
+ try {
+ logger.info("Processing LaReferencia repository logs");
+ processlaReferenciaLog();
+ logger.info("LaReferencia repository logs process done");
+
+ logger.info("LaReferencia removing double clicks");
+ removeDoubleClicks();
+ logger.info("LaReferencia removed double clicks");
+
+ logger.info("LaReferencia creating viewsStats");
+ viewsStats();
+ logger.info("LaReferencia created viewsStats");
+ logger.info("LaReferencia creating downloadsStats");
+ downloadsStats();
+ logger.info("LaReferencia created downloadsStats");
+ logger.info("LaReferencia updating Production Tables");
+ updateProdTables();
+ logger.info("LaReferencia updated Production Tables");
+
+ } catch (Exception e) {
+ logger.error("Failed to process logs: " + e);
+ throw new Exception("Failed to process logs: " + e.toString(), e);
+ }
+ }
+
+ /**
+  * Loads the downloaded LaReferencia JSON logs into Hive.
+  * <p>
+  * The method registers the JSON SerDe jar, recreates the external table {@code lareferencialogtmp_json} on
+  * top of {@code ExecuteWorkflow.lareferenciaLogPath}, recreates the transactional table
+  * {@code lareferencialogtmp} and fills it with one row per exploded {@code actionDetails} entry.
+  *
+  * @throws Exception if obtaining the connection or executing any of the statements fails
+  */
+ public void processlaReferenciaLog() throws Exception {
+     // try-with-resources: the statement used to leak whenever one of the updates below threw
+     try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
+         ConnectDB.getHiveConnection().setAutoCommit(false);
+
+         logger.info("Adding JSON Serde jar");
+         stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+         logger.info("Added JSON Serde jar");
+
+         logger.info("Dropping lareferencialogtmp_json table");
+         String drop_lareferencialogtmp_json = "DROP TABLE IF EXISTS " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".lareferencialogtmp_json";
+         stmt.executeUpdate(drop_lareferencialogtmp_json);
+         logger.info("Dropped lareferencialogtmp_json table");
+
+         logger.info("Creating lareferencialogtmp_json");
+         String create_lareferencialogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".lareferencialogtmp_json(\n" +
+             " `idSite` STRING,\n" +
+             " `idVisit` STRING,\n" +
+             " `country` STRING,\n" +
+             " `referrerName` STRING,\n" +
+             " `browser` STRING,\n" +
+             " `repItem` STRING,\n" +
+             " `actionDetails` ARRAY<\n" +
+             " struct<\n" +
+             " timestamp: STRING,\n" +
+             " type: STRING,\n" +
+             " url: STRING,\n" +
+             " `customVariables`: struct<\n" +
+             " `1`: struct<\n" +
+             " `customVariablePageValue1`: STRING\n" +
+             " >,\n" +
+             " `2`: struct<\n" +
+             " `customVariablePageValue2`: STRING\n" +
+             " >\n" +
+             " >\n" +
+             " >\n" +
+             " >" +
+             ")\n" +
+             "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
+             "LOCATION '" + ExecuteWorkflow.lareferenciaLogPath + "'\n" +
+             "TBLPROPERTIES (\"transactional\"=\"false\")";
+         stmt.executeUpdate(create_lareferencialogtmp_json);
+         logger.info("Created lareferencialogtmp_json");
+
+         logger.info("Dropping lareferencialogtmp table");
+         String drop_lareferencialogtmp = "DROP TABLE IF EXISTS " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".lareferencialogtmp";
+         stmt.executeUpdate(drop_lareferencialogtmp);
+         logger.info("Dropped lareferencialogtmp table");
+
+         logger.info("Creating lareferencialogtmp");
+         String create_lareferencialogtmp = "CREATE TABLE " +
+             ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp(matomoid INT, " +
+             "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " +
+             "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+             "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " +
+             "stored as orc tblproperties('transactional'='true')";
+         stmt.executeUpdate(create_lareferencialogtmp);
+         logger.info("Created lareferencialogtmp");
+
+         // one output row per element of actionDetails (LATERAL VIEW explode)
+         logger.info("Inserting into lareferencialogtmp");
+         String insert_lareferencialogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp " +
+             "SELECT DISTINCT cast(idSite as INT) as matomoid, CONCAT('opendoar____::', " +
+             "actiondetail.customVariables.`2`.customVariablePageValue2) as source, idVisit as id_Visit, country, " +
+             "actiondetail.type as action, actiondetail.url as url, " +
+             "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " +
+             "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " +
+             "referrerName as referrer_name, browser as agent " +
+             "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json " +
+             "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+         stmt.executeUpdate(insert_lareferencialogtmp);
+         logger.info("Inserted into lareferencialogtmp");
+     }
+ }
+
+ public void removeDoubleClicks() throws Exception {
+
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Cleaning download double clicks");
+ // clean download double clicks
+ String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" +
+ "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " +
+ ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " +
+ "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " +
+ "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " +
+ "AND p1.timestamp listHdfsDir(String dir) throws Exception {
+ FileSystem hdfs = FileSystem.get(new Configuration());
+ RemoteIterator Files;
+ ArrayList fileNames = new ArrayList<>();
+
+ try {
+ Path exportPath = new Path(hdfs.getUri() + dir);
+ Files = hdfs.listFiles(exportPath, false);
+ while (Files.hasNext()) {
+ String fileName = Files.next().getPath().toString();
+ // log.info("Found hdfs file " + fileName);
+ fileNames.add(fileName);
+ }
+ // hdfs.close();
+ } catch (Exception e) {
+ logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logRepoPath));
+ throw new Exception("HDFS file path with exported data does not exist : " + logRepoPath, e);
+ }
+
+ return fileNames;
+ }
+
+ /**
+  * Reads an HDFS file and concatenates its lines, skipping lines that are exactly {@code []}.
+  * <p>
+  * Every {@code ][{"idSite"} sequence in the concatenation is replaced by {@code ,{"idSite"}. When nothing
+  * is left the method returns {@code "[]"}.
+  *
+  * @param filename path of the file to read
+  * @return the concatenated content, or {@code "[]"} when it is empty
+  * @throws Exception wrapping any failure raised while opening or reading the file
+  */
+ private String readHDFSFile(String filename) throws Exception {
+     String result;
+     try {
+         FileSystem fs = FileSystem.get(new Configuration());
+         StringBuilder sb = new StringBuilder();
+
+         // try-with-resources: the reader (and the HDFS stream beneath it) used to stay open
+         try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))))) {
+             String line;
+             while ((line = br.readLine()) != null) {
+                 if (!line.equals("[]")) {
+                     sb.append(line);
+                 }
+             }
+         }
+
+         result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
+         if (result.equals("")) {
+             result = "[]";
+         }
+         // the FileSystem itself is intentionally left open, as in the original code
+     } catch (Exception e) {
+         logger.error(e.getMessage(), e);
+         throw new Exception(e);
+     }
+
+     return result;
+ }
+
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java
new file mode 100644
index 000000000..681105de4
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java
@@ -0,0 +1,327 @@
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.io.*;
+import java.net.Authenticator;
+import java.net.URL;
+import java.net.URLConnection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+ public class PiwikDownloadLogs {
+
+     private final String piwikUrl;
+     private Date startDate;
+     private final String tokenAuth;
+
+     /*
+      * The Piwik's API method
+      */
+     private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+     private final String format = "&format=json";
+
+     /** Charset used for the HTTP response and for the files written to HDFS. */
+     private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;
+
+     private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
+
+     /**
+      * @param piwikUrl host (and optional path) of the Piwik/Matomo server, without scheme
+      * @param tokenAuth value sent as the {@code token_auth} request parameter
+      */
+     public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
+         this.piwikUrl = piwikUrl;
+         this.tokenAuth = tokenAuth;
+     }
+
+     /** Returns the base URL of the server: {@code https://<piwikUrl>/}. */
+     private String getPiwikLogUrl() {
+         return "https://" + piwikUrl + "/";
+     }
+
+     /**
+      * Downloads the body of {@code url} and returns it with line breaks removed.
+      *
+      * @param url address to fetch
+      * @return response body, lines concatenated
+      * @throws Exception wrapping any failure raised while connecting or reading
+      */
+     private String getJson(String url) throws Exception {
+         try {
+             // NOTE(review): the URL carries token_auth, so this message puts the token in the logs
+             logger.debug("Connecting to download the JSON: " + url);
+             URL website = new URL(url);
+             URLConnection connection = website.openConnection();
+
+             StringBuilder response = new StringBuilder();
+             // decode explicitly as UTF-8 instead of the platform default charset
+             try (BufferedReader in = new BufferedReader(
+                 new InputStreamReader(connection.getInputStream(), UTF_8))) {
+                 String inputLine;
+                 while ((inputLine = in.readLine()) != null) {
+                     response.append(inputLine);
+                 }
+             }
+             return response.toString();
+         } catch (Exception e) {
+             logger.error("Failed to get URL: " + url + " Exception: " + e);
+             throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
+         }
+     }
+
+     /**
+      * Runnable that downloads the logs of one site for one day. Failures are logged and not propagated.
+      */
+     class WorkerThread implements Runnable {
+
+         private Calendar currDay;
+         private int siteId;
+         private String repoLogsPath;
+         private String portalLogPath;
+         private String portalMatomoID;
+
+         public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+             String portalMatomoID) throws IOException {
+             // the calendar is cloned because the caller keeps advancing its own instance
+             this.currDay = (Calendar) currDay.clone();
+             this.siteId = siteId;
+             this.repoLogsPath = repoLogsPath;
+             this.portalLogPath = portalLogPath;
+             this.portalMatomoID = portalMatomoID;
+         }
+
+         @Override
+         public void run() {
+             SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+             String parameters = "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+                 + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+                 + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID;
+
+             System.out.println(Thread.currentThread().getName() + " (Start) Thread for " + parameters);
+             try {
+                 GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+             } catch (Exception e) {
+                 // best effort: one failed day must not stop the other workers
+                 logger.error("Failed to download logs for " + parameters, e);
+             }
+             System.out.println(Thread.currentThread().getName() + " (End) Thread for " + parameters);
+         }
+
+         /** Delegates to the enclosing {@code PiwikDownloadLogs.GetOpenAIRELogsForDate}. */
+         public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+             String portalMatomoID) throws Exception {
+             PiwikDownloadLogs.this
+                 .GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+         }
+     }
+
+     /**
+      * Downloads the logs of every datasource with a non-zero {@code piwik_id}, day by day.
+      * <p>
+      * For each site the period starts at {@code ExecuteWorkflow.startingLogPeriod}, or at the latest
+      * timestamp already stored in {@code piwiklog} for that site, and the loop runs while the current day is
+      * before the end date computed from {@code ExecuteWorkflow.endingLogPeriod}. Days not later than the
+      * stored maximum are skipped.
+      *
+      * @param repoLogsPath HDFS folder for repository logs
+      * @param portalLogPath HDFS folder for the logs of the portal site
+      * @param portalMatomoID piwik id of the portal, as a decimal string
+      * @throws Exception on any database, HTTP, parsing or HDFS failure
+      */
+     public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
+
+         SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+         // Getting all the piwikids in a list for logging reasons & limiting the list
+         // to the max number of piwikids
+         List<Integer> piwikIdToVisit = new ArrayList<>();
+         try (Statement statement = ConnectDB.getHiveConnection().createStatement();
+             ResultSet rs = statement
+                 .executeQuery(
+                     "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
+                         + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id")) {
+             // the query result was ignored in favour of a hard-coded id (13); use it again
+             while (rs.next()) {
+                 piwikIdToVisit.add(rs.getInt(1));
+             }
+         }
+
+         logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
+
+         if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+             && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
+             logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+             piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+         }
+
+         logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
+
+         for (int siteId : piwikIdToVisit) {
+             // Setting the starting period
+             Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+             logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+
+             // Setting the ending period (last day of the month)
+             Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+             end.add(Calendar.MONTH, +1);
+             end.add(Calendar.DAY_OF_MONTH, -1);
+             logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+
+             logger.info("Now working on piwikId: " + siteId);
+
+             Date dateMax = null;
+             // statement and result set are closed per site; they used to pile up
+             try (PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
+                 .prepareStatement(
+                     "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+                         + ".piwiklog WHERE source=?")) {
+                 st.setInt(1, siteId);
+                 try (ResultSet rs_date = st.executeQuery()) {
+                     while (rs_date.next()) {
+                         String maxTimestamp = rs_date.getString(1);
+                         logger.info("Found max date: " + maxTimestamp + " for repository " + siteId);
+
+                         if (maxTimestamp != null && !maxTimestamp.equals("null") && !maxTimestamp.equals("")) {
+                             // only the leading yyyy-MM-dd part of the stored timestamp is parsed
+                             dateMax = sdf.parse(maxTimestamp);
+                             start.setTime(dateMax);
+                         }
+                     }
+                 }
+             }
+
+             for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+                 logger.info("Date used " + currDay.getTime().toString());
+
+                 if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+                     logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId);
+                 } else {
+                     GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+                 }
+             }
+         }
+     }
+
+     /**
+      * Downloads the visits of one site for one day, 1000 per request, and writes each response page to its own
+      * HDFS file {@code <siteId>_Piwiklog<yyyy-MM-dd>_offset_<page>.json}, one JSON object per line.
+      * Paging stops at the first empty response.
+      *
+      * @param currDay day to download
+      * @param siteId piwik id of the site
+      * @param repoLogsPath output folder used when {@code siteId} differs from {@code portalMatomoID}
+      * @param portalLogPath output folder used when {@code siteId} equals {@code portalMatomoID}
+      * @param portalMatomoID piwik id of the portal, as a decimal string
+      * @throws Exception on HTTP, parsing or HDFS failure
+      */
+     public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+         String portalMatomoID) throws Exception {
+         SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+         Date date = currDay.getTime();
+         String day = sdf.format(date);
+         logger.info("Downloading logs for repoid " + siteId + " and for " + day);
+
+         String period = "&period=day&date=" + day;
+         String outFolder = siteId == Integer.parseInt(portalMatomoID) ? portalLogPath : repoLogsPath;
+
+         String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+             + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+
+         JSONParser parser = new JSONParser();
+         // FileSystem.get() hands out a JVM-wide cached instance; it is deliberately NOT closed here,
+         // because closing it would break every other user of the same instance (e.g. worker threads)
+         FileSystem fs = FileSystem.get(new Configuration());
+
+         for (int i = 0;; i++) {
+             String apiUrl = baseApiUrl;
+             if (i > 0) {
+                 apiUrl += "&filter_offset=" + (i * 1000);
+             }
+
+             String content = getJson(apiUrl);
+             if (content.length() == 0 || content.equals("[]")) {
+                 break;
+             }
+
+             String fileName = siteId + "_Piwiklog" + day + "_offset_" + i + ".json";
+             int writtenBytes = 0;
+             // try-with-resources: the stream is closed even when parsing or writing fails
+             try (FSDataOutputStream fin = fs.create(new Path(outFolder + "/" + fileName), true)) {
+                 JSONArray jsonArray = (JSONArray) parser.parse(content);
+                 for (Object aJsonArray : jsonArray) {
+                     JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+                     byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(UTF_8);
+                     fin.write(jsonObjectRawBytes);
+                     // write(int) emits the single byte 0x0A; writeChar() emitted 0x00 0x0A
+                     fin.write('\n');
+
+                     writtenBytes += jsonObjectRawBytes.length + 1;
+                 }
+             }
+
+             System.out
+                 .println(
+                     Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+                         + " bytes. Filename: " + fileName);
+         }
+     }
+ }
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java
new file mode 100644
index 000000000..e0225d49a
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java
@@ -0,0 +1,1262 @@
+
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.io.*;
+import java.net.URLDecoder;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class PiwikStatsDB {
+
+ private String logPath;
+ private String logRepoPath;
+ private String logPortalPath;
+
+ private Statement stmt = null;
+
+ private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
+
+ private String CounterRobotsURL;
+ private ArrayList robotsList;
+
+ /**
+  * Stores the two log locations; no database or file system access happens here.
+  *
+  * @param logRepoPath value kept in the {@code logRepoPath} field
+  * @param logPortalPath value kept in the {@code logPortalPath} field
+  * @throws Exception declared for callers; nothing in the body throws
+  */
+ public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception {
+     this.logPortalPath = logPortalPath;
+     this.logRepoPath = logRepoPath;
+ }
+
+ /**
+  * Recursively deletes the directories {@code ExecuteWorkflow.repoLogPath} and
+  * {@code ExecuteWorkflow.portalLogPath}, then creates both again.
+  *
+  * @throws IllegalArgumentException if a path cannot be turned into a {@code Path}
+  * @throws IOException if a file system operation fails
+  */
+ public void reCreateLogDirs() throws IllegalArgumentException, IOException {
+     final FileSystem fileSystem = FileSystem.get(new Configuration());
+
+     // first remove both directories (recursive delete) ...
+     logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
+     final Path repoDirToDelete = new Path(ExecuteWorkflow.repoLogPath);
+     fileSystem.delete(repoDirToDelete, true);
+
+     logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
+     final Path portalDirToDelete = new Path(ExecuteWorkflow.portalLogPath);
+     fileSystem.delete(portalDirToDelete, true);
+
+     // ... then create them again
+     logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
+     final Path repoDirToCreate = new Path(ExecuteWorkflow.repoLogPath);
+     fileSystem.mkdirs(repoDirToCreate);
+
+     logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
+     final Path portalDirToCreate = new Path(ExecuteWorkflow.portalLogPath);
+     fileSystem.mkdirs(portalDirToCreate);
+ }
+
+ /**
+  * Drops and recreates the usage stats database, then creates the permanent and the temporary log tables.
+  *
+  * @throws Exception if any of the three steps fails
+  */
+ public void recreateDBAndTables() throws Exception {
+     createDatabase();
+     createTables();
+     // Original author's note: the piwiklog table is not needed since it is built on top of JSON files
+     createTmpTables();
+ }
+
+ /** @return the list currently held in {@code robotsList}; may be {@code null} if it was never set */
+ public ArrayList getRobotsList() {
+     return this.robotsList;
+ }
+
+ /** @param robotsList list to keep in the {@code robotsList} field (stored as is, not copied) */
+ public void setRobotsList(ArrayList robotsList) {
+     this.robotsList = robotsList;
+ }
+
+ /** @return the value of the {@code CounterRobotsURL} field; may be {@code null} if it was never set */
+ public String getCounterRobotsURL() {
+     return this.CounterRobotsURL;
+ }
+
+ /** @param CounterRobotsURL value to keep in the {@code CounterRobotsURL} field */
+ public void setCounterRobotsURL(String CounterRobotsURL) {
+     this.CounterRobotsURL = CounterRobotsURL;
+ }
+
+ /**
+  * Drops the usage stats database (with {@code CASCADE}) and creates it again.
+  * <p>
+  * Each step uses its own statement, closed as soon as the step ends; the original kept both statements
+  * open in the {@code stmt} field. NOTE(review): the {@code stmt} field is no longer assigned here —
+  * confirm no other method relies on that side effect.
+  *
+  * @throws Exception wrapping the failure of the drop or of the create step
+  */
+ private void createDatabase() throws Exception {
+     try (Statement dropStmt = ConnectDB.getHiveConnection().createStatement()) {
+         logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
+         String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
+         dropStmt.executeUpdate(dropDatabase);
+     } catch (Exception e) {
+         logger.error("Failed to drop database: " + e);
+         throw new Exception("Failed to drop database: " + e.toString(), e);
+     }
+
+     try (Statement createStmt = ConnectDB.getHiveConnection().createStatement()) {
+         logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
+         String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema();
+         createStmt.executeUpdate(createDatabase);
+     } catch (Exception e) {
+         logger.error("Failed to create database: " + e);
+         throw new Exception("Failed to create database: " + e.toString(), e);
+     }
+ }
+
+ /**
+  * Creates the transactional ORC tables {@code piwiklog} and {@code process_portal_log} in the usage stats
+  * schema when they do not exist yet, then closes the Hive connection returned by
+  * {@code ConnectDB.getHiveConnection()}.
+  * <p>
+  * NOTE(review): a local statement is used instead of the {@code stmt} field, which the original left
+  * pointing at a closed statement — confirm nothing depends on the field being assigned here.
+  *
+  * @throws Exception wrapping whatever failure occurred while creating the tables
+  */
+ private void createTables() throws Exception {
+     try {
+         // try-with-resources: the statement is released even when one of the DDLs fails
+         try (Statement createStmt = ConnectDB.getHiveConnection().createStatement()) {
+             // Create Piwiklog table - This table should exist
+             String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " +
+                 ConnectDB.getUsageStatsDBSchema() +
+                 ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " +
+                 "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+                 "clustered by (source, id_visit, action, timestamp, entity_id) " +
+                 "into 100 buckets stored as orc tblproperties('transactional'='true')";
+             createStmt.executeUpdate(sqlCreateTablePiwikLog);
+
+             String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " +
+                 ConnectDB.getUsageStatsDBSchema() +
+                 ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " +
+                 "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+                 "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
+             createStmt.executeUpdate(sqlCreateTablePortalLog);
+         }
+
+         ConnectDB.getHiveConnection().close();
+     } catch (Exception e) {
+         logger.error("Failed to create tables: " + e);
+         throw new Exception("Failed to create tables: " + e.toString(), e);
+     }
+ }
+
+ /**
+  * Creates the transactional ORC tables {@code piwiklogtmp} and {@code process_portal_log_tmp} in the usage
+  * stats schema when they do not exist yet.
+  *
+  * @throws Exception wrapping whatever failure occurred while creating the tables
+  */
+ private void createTmpTables() throws Exception {
+     // try-with-resources: the statement is released even when one of the DDLs fails
+     try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
+         String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " +
+             "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+             "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " +
+             "stored as orc tblproperties('transactional'='true')";
+         stmt.executeUpdate(sqlCreateTmpTablePiwikLog);
+
+         String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, " +
+             "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+             "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
+         stmt.executeUpdate(sqlCreateTmpTablePortalLog);
+     } catch (Exception e) {
+         logger.error("Failed to create tmptables: " + e);
+         throw new Exception("Failed to create tmp tables: " + e.toString(), e);
+     }
+ }
+
+ /**
+  * Runs the Piwik pipeline in order: load the COUNTER robots patterns, process the repository logs, remove
+  * double clicks, clean OAI identifiers, process the portal logs and portal statistics, build the views and
+  * downloads statistics, and finally update the production tables.
+  *
+  * @throws Exception wrapping the first failure raised by any of the steps
+  */
+ public void processLogs() throws Exception {
+     try {
+         ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL());
+         this.robotsList = counterRobots.getRobotsPatterns();
+
+         logger.info("Processing repository logs");
+         processRepositoryLog();
+         logger.info("Repository logs process done");
+
+         logger.info("Removing double clicks");
+         removeDoubleClicks();
+         logger.info("Removing double clicks done");
+
+         logger.info("Cleaning oai");
+         cleanOAI();
+         logger.info("Cleaning oai done");
+
+         logger.info("Processing portal logs");
+         processPortalLog();
+         logger.info("Portal logs process done");
+
+         logger.info("Processing portal usagestats");
+         portalStats();
+         logger.info("Portal usagestats process done");
+
+         logger.info("ViewsStats processing starts");
+         viewsStats();
+         logger.info("ViewsStats processing ends");
+
+         logger.info("DownloadsStats processing starts");
+         downloadsStats();
+         // this completion message used to repeat "starts", which made the log misleading
+         logger.info("DownloadsStats processing ends");
+
+         logger.info("Updating Production Tables");
+         updateProdTables();
+         logger.info("Updated Production Tables");
+
+     } catch (Exception e) {
+         logger.error("Failed to process logs: " + e);
+         throw new Exception("Failed to process logs: " + e.toString(), e);
+     }
+ }
+
+ /**
+  * Loads the repository logs stored under {@code ExecuteWorkflow.repoLogPath} into the
+  * {@code piwiklogtmp} table of the usage-stats schema.
+  * <p>
+  * The JSON files are exposed through the external table {@code piwiklogtmp_json}; {@code piwiklogtmp} is then
+  * dropped, recreated and filled with one row per element of {@code actionDetails}.
+  *
+  * @throws Exception if the Hive connection cannot be obtained or any statement fails
+  */
+ public void processRepositoryLog() throws Exception {
+
+     // try-with-resources: the statement is released even when one of the updates below fails
+     try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
+         ConnectDB.getHiveConnection().setAutoCommit(false);
+
+         logger.info("Adding JSON Serde jar");
+         stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+         logger.info("Added JSON Serde jar");
+
+         logger.info("Dropping piwiklogtmp_json table");
+         String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".piwiklogtmp_json";
+         stmt.executeUpdate(drop_piwiklogtmp_json);
+         logger.info("Dropped piwiklogtmp_json table");
+
+         // External table over the raw JSON log files; only the fields used further down are declared.
+         logger.info("Creating piwiklogtmp_json");
+         String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".piwiklogtmp_json(\n" +
+             " `idSite` STRING,\n" +
+             " `idVisit` STRING,\n" +
+             " `country` STRING,\n" +
+             " `referrerName` STRING,\n" +
+             " `browser` STRING,\n" +
+             " `actionDetails` ARRAY<\n" +
+             " struct<\n" +
+             " type: STRING,\n" +
+             " url: STRING,\n" +
+             " `customVariables`: struct<\n" +
+             " `1`: struct<\n" +
+             " `customVariablePageValue1`: STRING\n" +
+             " >\n" +
+             " >,\n" +
+             " timestamp: String\n" +
+             " >\n" +
+             " >\n" +
+             ")\n" +
+             "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
+             "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" +
+             "TBLPROPERTIES (\"transactional\"=\"false\")";
+         stmt.executeUpdate(create_piwiklogtmp_json);
+         logger.info("Created piwiklogtmp_json");
+
+         logger.info("Dropping piwiklogtmp table");
+         String drop_piwiklogtmp = "DROP TABLE IF EXISTS " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".piwiklogtmp";
+         stmt.executeUpdate(drop_piwiklogtmp);
+         logger.info("Dropped piwiklogtmp");
+
+         logger.info("Creating piwiklogtmp");
+         String create_piwiklogtmp = "CREATE TABLE " +
+             ConnectDB.getUsageStatsDBSchema() +
+             ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " +
+             "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+             "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
+         stmt.executeUpdate(create_piwiklogtmp);
+         logger.info("Created piwiklogtmp");
+
+         // One output row per action of a visit; every row is tagged with the item type 'repItem'.
+         logger.info("Inserting into piwiklogtmp");
+         String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+             "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " +
+             "actiondetail.type as action, actiondetail.url as url, " +
+             "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " +
+             "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " +
+             "referrerName as referrer_name, browser as agent\n" +
+             "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" +
+             "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+         stmt.executeUpdate(insert_piwiklogtmp);
+         logger.info("Inserted into piwiklogtmp");
+     }
+ }
+
+ public void removeDoubleClicks() throws Exception { // NOTE(review): the text of this method looks damaged, see the note inside the SQL string below
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+ // Builds a DELETE on piwiklogtmp whose EXISTS sub-query self-joins the table on source, id_visit, entity_id and action (restricted to 'download').
+ logger.info("Cleaning download double clicks");
+ // clean download double clicks
+ String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "WHERE EXISTS (\n" +
+ "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " +
+ ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" +
+ "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+ +
+ "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" +
+ "AND p1.timestamp\n" + // NOTE(review): the condition breaks off here; the fragments that follow read like the end of a CREATE EXTERNAL TABLE over ExecuteWorkflow.portalLogPath, and the variable executed below is never declared in this span. Text between the two statements appears to be missing and should be restored from the original file.
+ " >\n" +
+ ")\n" +
+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
+ "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" +
+ "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(create_process_portal_log_tmp_json);
+ logger.info("Created process_portal_log_tmp_json");
+ // From here on the code rebuilds process_portal_log_tmp: drop, create, then fill from process_portal_log_tmp_json.
+ logger.info("Droping process_portal_log_tmp table");
+ String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " +
+ ConnectDB.getUsageStatsDBSchema() +
+ ".process_portal_log_tmp";
+ stmt.executeUpdate(drop_process_portal_log_tmp);
+ logger.info("Dropped process_portal_log_tmp");
+ // ORC table declared transactional, clustered by (source, id_visit, timestamp) into 100 buckets.
+ logger.info("Creating process_portal_log_tmp");
+ String create_process_portal_log_tmp = "CREATE TABLE " +
+ ConnectDB.getUsageStatsDBSchema() +
+ ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " +
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+ "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(create_process_portal_log_tmp);
+ logger.info("Created process_portal_log_tmp");
+ // One row per exploded action; entity_id and source_item_type come from the first id parameter found in the URL, '' when none matches.
+ logger.info("Inserting into process_portal_log_tmp");
+ String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ + ".process_portal_log_tmp " +
+ "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
+ +
+ "actiondetail.url as url, " +
+ "CASE\n" +
+ " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " +
+ " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " +
+ " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
+ +
+ " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " +
+ " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " +
+ " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " +
+ " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " +
+ " ELSE '' " +
+ "END AS entity_id, " +
+ "CASE " +
+ " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " +
+ " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " +
+ " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " +
+ " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " +
+ " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " +
+ " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " +
+ " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " +
+ " ELSE '' " +
+ "END AS source_item_type, " +
+ "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " +
+ "browser as agent " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " +
+ "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+ stmt.executeUpdate(insert_process_portal_log_tmp);
+ logger.info("Inserted into process_portal_log_tmp");
+ // Reached only when every statement above succeeded; on failure the statement is not closed here.
+ stmt.close();
+ }
+
+ /**
+  * Copies portal log rows from {@code process_portal_log_tmp} into {@code piwiklogtmp}, once per entity kind
+  * (results, datasources, projects), keeping only rows whose {@code entity_id} is listed in the matching
+  * {@code *_oids} table of the stats schema. The Hive connection is closed after the last step.
+  *
+  * @throws SQLException if the connection cannot be obtained or one of the inserts fails
+  */
+ public void portalStats() throws SQLException {
+     Connection con = ConnectDB.getHiveConnection();
+     con.setAutoCommit(false);
+
+     // Original queries were of the style
+     //
+     // SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent
+     // FROM usagestats_20200907.process_portal_log_tmp2,
+     // openaire_prod_stats_20200821.result_oids roid
+     // WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null
+     //
+     // The following query is an example of how queries should be
+     //
+     // INSERT INTO usagestats_20200907.piwiklogtmp
+     // SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
+     // FROM usagestats_20200907.process_portal_log_tmp
+     // WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id
+     // IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL);
+     //
+     // We should consider whether we would like the queries to be as the following
+     //
+     // INSERT INTO usagestats_20200907.piwiklogtmp
+     // SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
+     // FROM usagestats_20200907.process_portal_log_tmp
+     // WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id
+     // IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND
+     // roid.oid != '');
+
+     logger.info("PortalStats - Step 1");
+     insertPortalEntitiesIntoPiwiklogtmp(con, "oaItem", "result_oids");
+
+     logger.info("PortalStats - Step 2");
+     insertPortalEntitiesIntoPiwiklogtmp(con, "datasource", "datasource_oids");
+
+     // The equivalent step for organizations ('organization' / organization_oids) was commented out in the
+     // original code and is therefore still not executed.
+     logger.info("PortalStats - Step 3");
+     insertPortalEntitiesIntoPiwiklogtmp(con, "project", "project_oids");
+
+     con.close();
+ }
+
+ /**
+  * Runs one INSERT ... SELECT from process_portal_log_tmp into piwiklogtmp and always closes its statement.
+  *
+  * @param con            connection the statement is created on
+  * @param sourceItemType literal stored in the source_item_type column of the inserted rows
+  * @param oidsTable      table of the stats schema whose {@code id} column filters the entity ids
+  * @throws SQLException if the statement cannot be created or executed
+  */
+ private void insertPortalEntitiesIntoPiwiklogtmp(Connection con, String sourceItemType, String oidsTable)
+     throws SQLException {
+     String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+         "SELECT DISTINCT source, id_visit, country, action, url, entity_id, '" + sourceItemType
+         + "', `timestamp`, referrer_name, agent " +
+         "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
+         "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
+         "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+         + "." + oidsTable + " roid WHERE roid.id IS NOT NULL)";
+     try (Statement stmt = con.createStatement()) {
+         stmt.executeUpdate(sql);
+     }
+ }
+
+ private void cleanOAI() throws Exception {
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Cleaning oai - Step 1");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," +
+ "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 2");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," +
+ "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 3");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," +
+ "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 4");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," +
+ "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 5");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," +
+ "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 6");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," +
+ "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 7");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," +
+ "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 8");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," +
+ "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 9");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," +
+ "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 10");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," +
+ "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 11");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," +
+ "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 12");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," +
+ "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 13");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," +
+ "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 14");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," +
+ "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 15");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," +
+ "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 16");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," +
+ "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 17");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," +
+ "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 18");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," +
+ "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 19");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," +
+ "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 20");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," +
+ "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 21");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," +
+ "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 22");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," +
+ "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 23");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," +
+ "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 24");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," +
+ "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 25");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," +
+ "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 26");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," +
+ "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 27");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," +
+ "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 28");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," +
+ "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 29");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," +
+ "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Done, closing connection");
+ ConnectDB.getHiveConnection().close();
+ }
+
+ private String processPortalURL(String url) {
+
+ if (url.indexOf("explore.openaire.eu") > 0) {
+ try {
+ url = URLDecoder.decode(url, "UTF-8");
+ } catch (Exception e) {
+ logger.info("Error when decoding the following URL: " + url);
+ }
+ if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) {
+ url = "datasource|"
+ + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59);
+ } else if (url.indexOf("datasource=") > 0
+ && url.substring(url.indexOf("datasource=") + 11).length() >= 46) {
+ url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57);
+ } else if (url.indexOf("datasourceFilter=") > 0
+ && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) {
+ url = "datasource|"
+ + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63);
+ } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) {
+ url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56);
+ } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) {
+ url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56);
+ } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46
+ && !url.contains("oai:dnet:corda")) {
+ url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56);
+ } else if (url.indexOf("organizationId=") > 0
+ && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) {
+ url = "organization|"
+ + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61);
+ } else {
+ url = "";
+ }
+ } else {
+ url = "";
+ }
+
+ return url;
+ }
+
+ private void updateProdTables() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Inserting data to piwiklog");
+ String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " +
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
+ stmt.executeUpdate(sql);
+
+ logger.info("Inserting data to views_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp";
+ stmt.executeUpdate(sql);
+
+ logger.info("Inserting data to downloads_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp";
+ stmt.executeUpdate(sql);
+
+ logger.info("Inserting data to pageviews_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " +
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp";
+ stmt.executeUpdate(sql);
+
+ logger.info("Creating usage_stats table");
+ String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
+ "AS SELECT coalesce(ds.source, vs.source) as source, " +
+ "coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
+ "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
+ "coalesce(ds.openaire, 0) as openaire_downloads, " +
+ "coalesce(vs.openaire, 0) as openaire_views " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " +
+ ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " +
+ "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
+ stmt.executeUpdate(createUsageStats);
+ logger.info("Created usage_stats table");
+
+
+ /*
+ * logger.info("Dropping table views_stats_tmp"); sql = "DROP TABLE IF EXISTS " +
+ * ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(sql);
+ * logger.info("Dropping table downloads_stats_tmp"); sql = "DROP TABLE IF EXISTS " +
+ * ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; stmt.executeUpdate(sql);
+ * logger.info("Dropping table pageviews_stats_tmp"); sql = "DROP TABLE IF EXISTS " +
+ * ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(sql);
+ * logger.info("Dropping table process_portal_log_tmp"); sql = "DROP TABLE IF EXISTS " +
+ * ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; stmt.executeUpdate(sql);
+ */
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+
+ }
+
+ /**
+  * Lists the files found directly (no recursion) in an HDFS directory.
+  *
+  * @param dir directory path, appended to the URI of the file system
+  * @return the full path of every file in {@code dir}, as strings
+  * @throws Exception if the directory cannot be listed; the original failure is attached as the cause
+  */
+ private ArrayList<String> listHdfsDir(String dir) throws Exception {
+
+     FileSystem hdfs = FileSystem.get(new Configuration());
+     ArrayList<String> fileNames = new ArrayList<>();
+
+     try {
+         Path exportPath = new Path(hdfs.getUri() + dir);
+         // Fully qualified so that no additional import is needed.
+         RemoteIterator<org.apache.hadoop.fs.LocatedFileStatus> files = hdfs.listFiles(exportPath, false);
+         while (files.hasNext()) {
+             fileNames.add(files.next().getPath().toString());
+         }
+
+         // NOTE(review): readHDFSFile in this class has its fs.close() commented out; confirm that closing
+         // the handle here cannot affect other users of the same file system.
+         hdfs.close();
+     } catch (Exception e) {
+         // Report the directory that was actually listed (the original message named logPath instead).
+         logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir));
+         throw new Exception("HDFS file path with exported data does not exist : " + dir, e);
+     }
+
+     return fileNames;
+ }
+
+ /**
+  * Reads an HDFS file and concatenates its lines into a single string.
+  * <p>
+  * Lines equal to {@code []} are skipped, every occurrence of {@code ][{"idSite"} is replaced by
+  * {@code ,{"idSite"}, and an empty result is returned as {@code []}.
+  *
+  * @param filename path of the file to read
+  * @return the concatenated content, or {@code "[]"} when nothing was read
+  * @throws Exception wrapping any failure while opening or reading the file
+  */
+ private String readHDFSFile(String filename) throws Exception {
+     String result;
+     try {
+
+         // The FileSystem handle itself is left open, as in the original (its close() call was commented out).
+         FileSystem fs = FileSystem.get(new Configuration());
+         // log.info("reading file : " + filename);
+
+         StringBuilder sb = new StringBuilder();
+         // try-with-resources closes the reader, and with it the underlying HDFS stream, on every path.
+         try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))))) {
+             String line = br.readLine();
+             while (line != null) {
+                 if (!line.equals("[]")) {
+                     sb.append(line);
+                 }
+                 line = br.readLine();
+             }
+         }
+
+         result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
+         if (result.equals("")) {
+             result = "[]";
+         }
+
+     } catch (Exception e) {
+         logger.error(e.getMessage());
+         throw new Exception(e);
+     }
+
+     return result;
+ }
+
+ private Connection getConnection() throws SQLException {
+ return ConnectDB.getHiveConnection();
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java
new file mode 100644
index 000000000..6f020daa0
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java
@@ -0,0 +1,54 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+
+import org.json.JSONException;
+import org.json.simple.JSONArray;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+public class ReadCounterRobotsList {
+
+ private ArrayList robotsPatterns = new ArrayList();
+ private String COUNTER_ROBOTS_URL;
+
+ public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException {
+ COUNTER_ROBOTS_URL = url;
+ robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL);
+ }
+
+ private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException {
+ InputStream is = new URL(url).openStream();
+ JSONParser parser = new JSONParser();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1")));
+ JSONArray jsonArray = (JSONArray) parser.parse(reader);
+ for (Object aJsonArray : jsonArray) {
+ org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray;
+ robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\"));
+ }
+ return robotsPatterns;
+ }
+
+ public ArrayList getRobotsPatterns() {
+ return robotsPatterns;
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java
new file mode 100644
index 000000000..54ed286cb
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java
@@ -0,0 +1,575 @@
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.io.*;
+// import java.io.BufferedReader;
+// import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class SarcStats {
+
+ // Hive statement; assigned by createTables() and finalizeSarcStats()
+ private Statement stmtHive = null;
+ // Impala statement; assigned by finalizeSarcStats()
+ private Statement stmtImpala = null;
+
+ private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
+
+ /**
+  * Creates the instance without touching the database: the createTables() call below is commented out.
+  */
+ public SarcStats() throws Exception {
+// createTables();
+ }
+
+ private void createTables() throws Exception {
+ try {
+
+ stmtHive = ConnectDB.getHiveConnection().createStatement();
+ String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
+ stmtHive.executeUpdate(sqlCreateTableSushiLog);
+
+ // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
+ // stmt.executeUpdate(sqlCopyPublicSushiLog);
+ String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ + " ON INSERT TO sushilog "
+ + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
+ + "sushilog.rid, sushilog.date "
+ + "FROM sushilog "
+ + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
+ stmtHive.executeUpdate(sqlcreateRuleSushiLog);
+ String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
+ stmtHive.executeUpdate(createSushiIndex);
+
+ stmtHive.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Sushi Tables Created");
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
+
+ /**
+  * Empties both SARC report directories: each one is deleted recursively and then created again.
+  *
+  * @throws IOException if the file system cannot be obtained or an operation on it fails
+  */
+ public void reCreateLogDirs() throws IOException {
+     final FileSystem fileSystem = FileSystem.get(new Configuration());
+     final String arrayDir = ExecuteWorkflow.sarcsReportPathArray;
+     final String nonArrayDir = ExecuteWorkflow.sarcsReportPathNonArray;
+
+     // Remove both directories (recursively) ...
+     logger.info("Deleting sarcsReport (Array) directory: " + arrayDir);
+     fileSystem.delete(new Path(arrayDir), true);
+     logger.info("Deleting sarcsReport (NonArray) directory: " + nonArrayDir);
+     fileSystem.delete(new Path(nonArrayDir), true);
+
+     // ... and bring them back empty
+     logger.info("Creating sarcsReport (Array) directory: " + arrayDir);
+     fileSystem.mkdirs(new Path(arrayDir));
+     logger.info("Creating sarcsReport (NonArray) directory: " + nonArrayDir);
+     fileSystem.mkdirs(new Path(nonArrayDir));
+ }
+
+ /**
+  * Rebuilds the SARC staging tables in Hive from the downloaded report files.
+  * <p>
+  * Registers the JSON SerDe jar, recreates the two external tables located at the given report directories,
+  * creates {@code sarc_sushilogtmp} if it is missing and fills it from both external tables.
+  * The statement is always closed and the Hive connection is closed when the method ends, on success or failure.
+  *
+  * @param sarcsReportPathArray location of the external table whose ItemIdentifier column is an array
+  * @param sarcsReportPathNonArray location of the external table whose ItemIdentifier column is a single struct
+  * @throws Exception if the connection cannot be obtained or any statement fails
+  */
+ public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
+     try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
+         ConnectDB.getHiveConnection().setAutoCommit(false);
+
+         logger.info("Adding JSON Serde jar");
+         stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+         logger.info("Added JSON Serde jar");
+
+         logger.info("Dropping sarc_sushilogtmp_json_array table");
+         String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS "
+             + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
+         stmt.executeUpdate(drop_sarc_sushilogtmp_json_array);
+         logger.info("Dropped sarc_sushilogtmp_json_array table");
+
+         logger.info("Creating sarc_sushilogtmp_json_array table");
+         String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+             + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n"
+             + " `ItemIdentifier` ARRAY<\n"
+             + " struct<\n"
+             + " `Type`: STRING,\n"
+             + " `Value`: STRING\n"
+             + " >\n"
+             + " >,\n"
+             + " `ItemPerformance` struct<\n"
+             + " `Period`: struct<\n"
+             + " `Begin`: STRING,\n"
+             + " `End`: STRING\n"
+             + " >,\n"
+             + " `Instance`: struct<\n"
+             + " `Count`: STRING,\n"
+             + " `MetricType`: STRING\n"
+             + " >\n"
+             + " >\n"
+             + ")"
+             + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+             + "LOCATION '" + sarcsReportPathArray + "/'\n"
+             + "TBLPROPERTIES (\"transactional\"=\"false\")";
+         stmt.executeUpdate(create_sarc_sushilogtmp_json_array);
+         logger.info("Created sarc_sushilogtmp_json_array table");
+
+         logger.info("Dropping sarc_sushilogtmp_json_non_array table");
+         String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS "
+             + ConnectDB.getUsageStatsDBSchema()
+             + ".sarc_sushilogtmp_json_non_array";
+         stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array);
+         logger.info("Dropped sarc_sushilogtmp_json_non_array table");
+
+         logger.info("Creating sarc_sushilogtmp_json_non_array table");
+         String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+             + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n"
+             + " `ItemIdentifier` struct<\n"
+             + " `Type`: STRING,\n"
+             + " `Value`: STRING\n"
+             + " >,\n"
+             + " `ItemPerformance` struct<\n"
+             + " `Period`: struct<\n"
+             + " `Begin`: STRING,\n"
+             + " `End`: STRING\n"
+             + " >,\n"
+             + " `Instance`: struct<\n"
+             + " `Count`: STRING,\n"
+             + " `MetricType`: STRING\n"
+             + " >\n"
+             + " >"
+             + ")"
+             + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+             + "LOCATION '" + sarcsReportPathNonArray + "/'\n"
+             + "TBLPROPERTIES (\"transactional\"=\"false\")";
+         stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array);
+         logger.info("Created sarc_sushilogtmp_json_non_array table");
+
+         logger.info("Creating sarc_sushilogtmp table");
+         String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+             + ".sarc_sushilogtmp(source STRING, repository STRING, "
+             + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
+             + "tblproperties('transactional'='true')";
+         stmt.executeUpdate(create_sarc_sushilogtmp);
+         logger.info("Created sarc_sushilogtmp table");
+
+         // Array-shaped identifiers: explode the array and keep only the DOI entries
+         logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
+         String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
+             + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
+             + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
+             + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
+             + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array "
+             + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
+             + "WHERE `ItemIdent`.`Type`='DOI'";
+         stmt.executeUpdate(insert_sarc_sushilogtmp);
+         logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
+
+         // Single-struct identifiers: taken as they are, without a type filter
+         logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
+         insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
+             + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
+             + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
+             + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
+             + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
+         stmt.executeUpdate(insert_sarc_sushilogtmp);
+         logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
+     } finally {
+         // Previously skipped whenever a statement failed
+         ConnectDB.getHiveConnection().close();
+     }
+ }
+
+ public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
+
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Creating sushilog table");
+ String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog "
+ + "(`source` string, "
+ + "`repository` string, "
+ + "`rid` string, "
+ + "`date` string, "
+ + "`metric_type` string, "
+ + "`count` int)";
+ stmt.executeUpdate(createSushilog);
+ logger.info("Created sushilog table");
+
+ logger.info("Dropping sarc_sushilogtmp table");
+ String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".sarc_sushilogtmp";
+ stmt.executeUpdate(drop_sarc_sushilogtmp);
+ logger.info("Dropped sarc_sushilogtmp table");
+ ConnectDB.getHiveConnection().close();
+
+ List issnAndUrls = new ArrayList();
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030"
+ });
+ issnAndUrls.add(new String[]{
+ "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826"
+ });
+ issnAndUrls.add(new String[]{
+ "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015"
+ });
+
+ if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0
+ && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) {
+ logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload);
+ issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload);
+ }
+
+ logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls);
+
+ for (String[] issnAndUrl : issnAndUrls) {
+ logger.info("Now working on ISSN: " + issnAndUrl[1]);
+ getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]);
+ }
+
+ }
+
+ /**
+  * Turns the staged SARC rows into download statistics and appends them to the permanent tables.
+  * <p>
+  * Steps, in order: create {@code downloads_stats_tmp} if missing; rebuild {@code sarc_sushilogtmp_impala} as a
+  * Parquet copy of {@code sarc_sushilogtmp}; rebuild {@code downloads_stats_impala} through Impala; append it to
+  * {@code downloads_stats_tmp}; create {@code sushilog} if missing and append {@code sarc_sushilogtmp} to it.
+  * Both statements are closed and the Hive connection is closed when the method ends, on success or failure.
+  *
+  * @throws Exception if a connection cannot be obtained or a statement fails
+  */
+ public void finalizeSarcStats() throws Exception {
+     stmtHive = ConnectDB.getHiveConnection().createStatement();
+     try {
+         ConnectDB.getHiveConnection().setAutoCommit(false);
+         stmtImpala = ConnectDB.getImpalaConnection().createStatement();
+         try {
+             logger.info("Creating downloads_stats table_tmp");
+             String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+                 + ".downloads_stats_tmp "
+                 + "(`source` string, "
+                 + "`repository_id` string, "
+                 + "`result_id` string, "
+                 + "`date` string, "
+                 + "`count` bigint, "
+                 + "`openaire` bigint)";
+             stmtHive.executeUpdate(createDownloadsStats);
+             logger.info("Created downloads_stats_tmp table");
+
+             logger.info("Dropping sarc_sushilogtmp_impala table");
+             String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS "
+                 + ConnectDB.getUsageStatsDBSchema()
+                 + ".sarc_sushilogtmp_impala";
+             stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala);
+             logger.info("Dropped sarc_sushilogtmp_impala table");
+
+             logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala");
+             String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+                 + ".sarc_sushilogtmp_impala "
+                 + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
+             stmtHive.executeUpdate(createSarcSushilogtmpImpala);
+             logger.info("Created sarc_sushilogtmp_impala");
+
+             logger.info("Making sarc_sushilogtmp visible to impala");
+             String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema()
+                 + ".sarc_sushilogtmp_impala;";
+             stmtImpala.executeUpdate(invalidateMetadata);
+
+             logger.info("Dropping downloads_stats_impala table");
+             String drop_downloads_stats_impala = "DROP TABLE IF EXISTS "
+                 + ConnectDB.getUsageStatsDBSchema()
+                 + ".downloads_stats_impala";
+             stmtHive.executeUpdate(drop_downloads_stats_impala);
+             logger.info("Dropped downloads_stats_impala table");
+
+             logger.info("Making downloads_stats_impala deletion visible to impala");
+             try {
+                 String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA "
+                     + ConnectDB.getUsageStatsDBSchema()
+                     + ".downloads_stats_impala;";
+                 stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala);
+             } catch (SQLException sqle) {
+                 // Best effort, as before: a failure here must not stop the run, but it is no longer silent.
+                 // NOTE(review): presumably Impala rejects the statement when the table is already gone - confirm
+                 logger.warn("Could not invalidate metadata of downloads_stats_impala: " + sqle);
+             }
+
+             // We run the following query in Impala because it is faster
+             logger.info("Creating downloads_stats_impala");
+             String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
+                 + ".downloads_stats_impala AS "
+                 + "SELECT s.source, d.id AS repository_id, "
+                 + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
+                 + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
+                 + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, "
+                 + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+                 + ConnectDB.getStatsDBSchema() + ".result_pids ro "
+                 + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
+                 + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
+             stmtImpala.executeUpdate(createDownloadsStatsImpala);
+             logger.info("Created downloads_stats_impala");
+
+             // Insert into downloads_stats
+             logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp");
+             String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+                 + ".downloads_stats_tmp SELECT * "
+                 + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala";
+             stmtHive.executeUpdate(insertDStats);
+             logger.info("Inserted into downloads_stats_tmp");
+
+             // NOTE(review): getAndProcessSarc() creates sushilog with a `repository` column and getARReport()
+             // filters on `repository`, while this definition names the column `repository_id` - confirm
+             // which name is intended (the definition only takes effect when the table does not exist yet)
+             logger.info("Creating sushilog table");
+             String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+                 + ".sushilog "
+                 + "(`source` string, "
+                 + "`repository_id` string, "
+                 + "`rid` string, "
+                 + "`date` string, "
+                 + "`metric_type` string, "
+                 + "`count` int)";
+             stmtHive.executeUpdate(createSushilog);
+             logger.info("Created sushilog table");
+
+             // Insert into sushilog
+             logger.info("Inserting into sushilog");
+             String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+                 + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
+             stmtHive.executeUpdate(insertSushiLog);
+             logger.info("Inserted into sushilog");
+         } finally {
+             // The Impala statement was never closed before
+             stmtImpala.close();
+         }
+     } finally {
+         try {
+             stmtHive.close();
+         } finally {
+             ConnectDB.getHiveConnection().close();
+         }
+     }
+ }
+
+ public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray,
+ String url, String issn) throws Exception {
+ logger.info("Processing SARC! issn: " + issn + " with url: " + url);
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
+
+ // Setting the ending period (last day of the month)
+ Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+ end.add(Calendar.MONTH, +1);
+ end.add(Calendar.DAY_OF_MONTH, -1);
+ logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ PreparedStatement st = ConnectDB
+ .getHiveConnection()
+ .prepareStatement(
+ "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
+ st.setString(1, issn);
+ ResultSet rs_date = st.executeQuery();
+ Date dateMax = null;
+ while (rs_date.next()) {
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
+
+ // Creating the needed configuration for the correct storing of data
+ Configuration config = new Configuration();
+ config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
+ config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
+ config
+ .set(
+ "fs.hdfs.impl",
+ org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+ config
+ .set(
+ "fs.file.impl",
+ org.apache.hadoop.fs.LocalFileSystem.class.getName());
+ FileSystem dfs = FileSystem.get(config);
+
+ if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn);
+ } else {
+
+ while (start.before(end)) {
+ String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
+ + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
+ start.add(Calendar.MONTH, 1);
+
+ logger.info("(getARReport) Getting report: " + reportUrl);
+ String text = getJson(reportUrl);
+ if (text == null) {
+ continue;
+ }
+
+ JSONParser parser = new JSONParser();
+ JSONObject jsonObject = null;
+ try {
+ jsonObject = (JSONObject) parser.parse(text);
+ } // if there is a parsing error continue with the next url
+ catch (ParseException pe) {
+ continue;
+ }
+
+ jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse");
+ jsonObject = (JSONObject) jsonObject.get("sc:Report");
+ if (jsonObject == null) {
+ continue;
+ }
+ jsonObject = (JSONObject) jsonObject.get("c:Report");
+ jsonObject = (JSONObject) jsonObject.get("c:Customer");
+ Object obj = jsonObject.get("c:ReportItems");
+ JSONArray jsonArray = new JSONArray();
+ if (obj instanceof JSONObject) {
+ jsonArray.add(obj);
+ } else {
+ jsonArray = (JSONArray) obj;
+ // jsonArray = (JSONArray) jsonObject.get("c:ReportItems");
+ }
+ if (jsonArray == null) {
+ continue;
+ }
+
+ // Creating the file in the filesystem for the ItemIdentifier as array object
+ String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_"
+ + simpleDateFormat.format(start.getTime()) + ".json";
+ logger.info("Storing to file: " + filePathArray);
+ FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true);
+
+ // Creating the file in the filesystem for the ItemIdentifier as array object
+ String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_"
+ + simpleDateFormat.format(start.getTime()) + ".json";
+ logger.info("Storing to file: " + filePathNonArray);
+ FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true);
+
+ for (Object aJsonArray : jsonArray) {
+
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ renameKeysRecursively(":", jsonObjectRow);
+
+ if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) {
+ finNonArray.write(jsonObjectRow.toJSONString().getBytes());
+ finNonArray.writeChar('\n');
+ } else {
+ finArray.write(jsonObjectRow.toJSONString().getBytes());
+ finArray.writeChar('\n');
+ }
+ }
+
+ finArray.close();
+ finNonArray.close();
+
+ // Check the file size and if it is too big, delete it
+ File fileArray = new File(filePathArray);
+ if (fileArray.length() == 0)
+ fileArray.delete();
+ File fileNonArray = new File(filePathNonArray);
+ if (fileNonArray.length() == 0)
+ fileNonArray.delete();
+
+ }
+
+ dfs.close();
+ }
+ //ConnectDB.getHiveConnection().close();
+ }
+
+ /**
+  * Walks a JSON array and applies the key renaming to every nested object or array it contains.
+  * Elements of any other type are left untouched.
+  */
+ private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception {
+     for (Object element : givenJsonObj) {
+         if (element instanceof JSONObject) {
+             renameKeysRecursively(delimiter, (JSONObject) element);
+         } else if (element instanceof JSONArray) {
+             renameKeysRecursively(delimiter, (JSONArray) element);
+         }
+     }
+ }
+
+ /**
+  * Renames every key of a JSON object to the part after its last delimiter (e.g. "c:ItemIdentifier" becomes
+  * "ItemIdentifier" for delimiter ":"), then does the same inside nested objects and arrays.
+  *
+  * @param delimiter separator handed to {@link String#split(String)}, so it is interpreted as a regular expression
+  * @param givenJsonObj object whose keys are rewritten in place
+  */
+ @SuppressWarnings("unchecked") // json-simple's JSONObject is a raw map
+ private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception {
+     // Iterate over a copy of the keys: the object is modified inside the loop
+     Set<String> jkeys = new HashSet<String>(givenJsonObj.keySet());
+     for (String jkey : jkeys) {
+
+         String[] splitArray = jkey.split(delimiter);
+         // split() returns an empty array for a key made only of delimiters; keep such a key as it is
+         String newJkey = splitArray.length == 0 ? jkey : splitArray[splitArray.length - 1];
+
+         Object jval = givenJsonObj.get(jkey);
+         givenJsonObj.remove(jkey);
+         givenJsonObj.put(newJkey, jval);
+
+         if (jval instanceof JSONObject) {
+             renameKeysRecursively(delimiter, (JSONObject) jval);
+         }
+
+         if (jval instanceof JSONArray) {
+             renameKeysRecursively(delimiter, (JSONArray) jval);
+         }
+     }
+ }
+
+ /**
+  * Fetches the content behind a URL as text, with lines joined by "\n".
+  * <p>
+  * Failures are logged and not propagated: the method then returns an empty string.
+  *
+  * @param url address to fetch
+  * @return the response body, or "" when the request or the reading failed
+  */
+ private String getJson(String url) throws Exception {
+     // String cred=username+":"+password;
+     // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
+     try {
+         URLConnection connection = new URL(url).openConnection();
+         // connection.setRequestProperty ("Authorization", "Basic "+encoded);
+         StringBuilder response = new StringBuilder();
+         // Decode explicitly as UTF-8 instead of relying on the platform default charset
+         try (BufferedReader in = new BufferedReader(
+             new InputStreamReader(connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
+             String inputLine;
+             while ((inputLine = in.readLine()) != null) {
+                 response.append(inputLine);
+                 response.append("\n");
+             }
+         }
+         return response.toString();
+     } catch (Exception e) {
+         // Logging error and silently continuing; the URL and the stack trace are now part of the log entry
+         logger.error("Failed to get URL: " + url, e);
+         System.out.println("Failed to get URL: " + e);
+     }
+     return "";
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java
new file mode 100644
index 000000000..c4ee7d63c
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java
@@ -0,0 +1,186 @@
+
+package eu.dnetlib.oa.graph.usagerawdata.export;
+
+import java.io.IOException;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Main class for downloading and processing Usage statistics
+ *
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class UsageStatsExporter {
+
+ /** Creates the exporter; no state is initialised here. */
+ public UsageStatsExporter() {
+
+ }
+
+ // Class-wide slf4j logger
+ private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
+
+ private void reCreateLogDirs() throws IllegalArgumentException, IOException {
+ FileSystem dfs = FileSystem.get(new Configuration());
+
+ logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
+ dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
+
+ logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
+ dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
+
+ logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+ dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
+
+ logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
+
+ logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
+
+ logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
+ }
+
+ /**
+  * Runs the raw-data part of the usage statistics workflow.
+  * <p>
+  * Every step is switched on or off by a flag of {@code ExecuteWorkflow}: recreating the database, emptying
+  * the log directories, and downloading the Piwik, LaReferencia, IRUS and SARC data. The processing steps are
+  * commented out in this class. Progress messages are logged only for steps that actually run.
+  *
+  * @throws Exception if any enabled step fails
+  */
+ public void export() throws Exception {
+
+     logger.info("Initialising DB properties");
+     ConnectDB.init();
+
+// runImpalaQuery();
+
+     PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
+
+     if (ExecuteWorkflow.recreateDbAndTables) {
+         logger.info("Re-creating database and tables");
+         piwikstatsdb.recreateDBAndTables();
+     }
+
+     logger.info("Initializing the download logs module");
+     PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
+
+     if (ExecuteWorkflow.piwikEmptyDirs) {
+         logger.info("Recreating Piwik log directories");
+         piwikstatsdb.reCreateLogDirs();
+     }
+
+     // Downloading piwik logs (also managing directory creation)
+     if (ExecuteWorkflow.downloadPiwikLogs) {
+         logger.info("Downloading piwik logs");
+         piwd
+             .GetOpenAIRELogs(
+                 ExecuteWorkflow.repoLogPath,
+                 ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
+         logger.info("Downloaded piwik logs");
+     }
+/*
+ // Create DB tables, insert/update statistics
+ String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
+ piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
+
+ if (ExecuteWorkflow.processPiwikLogs) {
+ logger.info("Processing logs");
+ piwikstatsdb.processLogs();
+ }
+*/
+     logger.info("Creating LaReferencia tables");
+     LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
+         ExecuteWorkflow.lareferenciaAuthToken);
+
+     if (ExecuteWorkflow.laReferenciaEmptyDirs) {
+         logger.info("Recreating LaReferencia log directories");
+         lrf.reCreateLogDirs();
+     }
+
+     if (ExecuteWorkflow.downloadLaReferenciaLogs) {
+         logger.info("Downloading LaReferencia logs");
+         lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
+         logger.info("Downloaded LaReferencia logs");
+     }
+/*
+ LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
+
+ if (ExecuteWorkflow.processLaReferenciaLogs) {
+ logger.info("Processing LaReferencia logs");
+ lastats.processLogs();
+ logger.info("LaReferencia logs done");
+ }
+*/
+     IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
+     if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
+         logger.info("Creating Irus Stats tables");
+         irusstats.createTables();
+         logger.info("Created Irus Stats tables");
+
+         logger.info("Re-create log dirs");
+         irusstats.reCreateLogDirs();
+         logger.info("Re-created log dirs");
+     }
+
+     if (ExecuteWorkflow.irusDownloadReports) {
+         irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
+     }
+/*
+ if (ExecuteWorkflow.irusProcessStats) {
+ irusstats.processIrusStats();
+ logger.info("Irus done");
+ }
+*/
+     SarcStats sarcStats = new SarcStats();
+     if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
+         sarcStats.reCreateLogDirs();
+     }
+     if (ExecuteWorkflow.sarcDownloadReports) {
+         sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
+     }
+/*
+ if (ExecuteWorkflow.sarcProcessStats) {
+ sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
+ sarcStats.finalizeSarcStats();
+ }
+ logger.info("Sarc done");
+*/
+
+/*
+ // finalize usagestats
+ if (ExecuteWorkflow.finalizeStats) {
+ piwikstatsdb.finalizeStats();
+ logger.info("Finalized stats");
+ }
+*/
+
+/*
+ // Make the tables available to Impala
+ if (ExecuteWorkflow.finalTablesVisibleToImpala) {
+ logger.info("Making tables visible to Impala");
+ invalidateMetadata();
+ }
+*/
+     logger.info("End");
+ }
+
+ private void invalidateMetadata() throws SQLException {
+ Statement stmt = null;
+
+ stmt = ConnectDB.getImpalaConnection().createStatement();
+
+ String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
+ stmt.executeUpdate(sql);
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json
new file mode 100644
index 000000000..988c23b48
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json
@@ -0,0 +1,231 @@
+[
+ {
+ "paramName": "mat",
+ "paramLongName": "matomoAuthToken",
+ "paramDescription": "when true will stop SparkSession after job execution",
+ "paramRequired": false
+ },
+ {
+ "paramName": "mbu",
+ "paramLongName": "matomoBaseURL",
+ "paramDescription": "URL of the isLookUp Service",
+ "paramRequired": true
+ },
+ {
+ "paramName": "rlp",
+ "paramLongName": "repoLogPath",
+ "paramDescription": "nameNode of the source cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "plp",
+ "paramLongName": "portalLogPath",
+ "paramDescription": "namoNode of the target cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pmi",
+ "paramLongName": "portalMatomoID",
+ "paramDescription": "namoNode of the target cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "iukbuw",
+ "paramLongName": "irusUKBaseURL",
+ "paramDescription": "working directory",
+ "paramRequired": true
+ },
+ {
+ "paramName": "iukrp",
+ "paramLongName": "irusUKReportPath",
+ "paramDescription": "maximum number of map tasks used in the distcp process",
+ "paramRequired": true
+ },
+ {
+ "paramName": "srpa",
+ "paramLongName": "sarcsReportPathArray",
+ "paramDescription": "memory for distcp action copying actionsets from remote cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "srpna",
+ "paramLongName": "sarcsReportPathNonArray",
+ "paramDescription": "timeout for distcp copying actions from remote cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "llp",
+ "paramLongName": "lareferenciaLogPath",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "lbu",
+ "paramLongName": "lareferenciaBaseURL",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "lat",
+ "paramLongName": "lareferenciaAuthToken",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dbhu",
+ "paramLongName": "dbHiveUrl",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dbiu",
+ "paramLongName": "dbImpalaUrl",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "usdbs",
+ "paramLongName": "usageStatsDBSchema",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "sdbs",
+ "paramLongName": "statsDBSchema",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "rdbt",
+ "paramLongName": "recreateDbAndTables",
+ "paramDescription": "Re-create database and initial tables?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pwed",
+ "paramLongName": "piwikEmptyDirs",
+ "paramDescription": "Empty piwik directories?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ppwl",
+ "paramLongName": "processPiwikLogs",
+ "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dpwl",
+ "paramLongName": "downloadPiwikLogs",
+ "paramDescription": "download piwik logs?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "slp",
+ "paramLongName": "startingLogPeriod",
+ "paramDescription": "Starting log period",
+ "paramRequired": true
+ },
+ {
+ "paramName": "elp",
+ "paramLongName": "endingLogPeriod",
+ "paramDescription": "Ending log period",
+ "paramRequired": true
+ },
+ {
+ "paramName": "npidd",
+ "paramLongName": "numberOfPiwikIdsToDownload",
+ "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nsidd",
+ "paramLongName": "numberOfSiteIdsToDownload",
+ "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload",
+ "paramRequired": true
+ },
+ {
+ "paramName": "lerd",
+ "paramLongName": "laReferenciaEmptyDirs",
+ "paramDescription": "Empty LaReferencia directories?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "plrl",
+ "paramLongName": "processLaReferenciaLogs",
+ "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dlrl",
+ "paramLongName": "downloadLaReferenciaLogs",
+ "paramDescription": "download La Referencia logs?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "icted",
+ "paramLongName": "irusCreateTablesEmptyDirs",
+ "paramDescription": "Irus section: Create tables and empty JSON directories?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "idr",
+ "paramLongName": "irusDownloadReports",
+ "paramDescription": "Irus section: Download reports?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ipr",
+ "paramLongName": "irusProcessStats",
+ "paramDescription": "Irus section: Process stats?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "inod",
+ "paramLongName": "irusNumberOfOpendoarsToDownload",
+ "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload",
+ "paramRequired": true
+ },
+ {
+ "paramName": "icted",
+ "paramLongName": "sarcCreateTablesEmptyDirs",
+ "paramDescription": "Sarc section: Create tables and empty JSON directories?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "idr",
+ "paramLongName": "sarcDownloadReports",
+ "paramDescription": "Sarc section: Download reports?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ipr",
+ "paramLongName": "sarcProcessStats",
+ "paramDescription": "Sarc section: Process stats?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "inod",
+ "paramLongName": "sarcNumberOfIssnToDownload",
+ "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
+ "paramRequired": true
+ },
+
+ {
+ "paramName": "fs",
+ "paramLongName": "finalizeStats",
+ "paramDescription": "Create the usage_stats table?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ftvi",
+ "paramLongName": "finalTablesVisibleToImpala",
+ "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nodt",
+ "paramLongName": "numberOfDownloadThreads",
+ "paramDescription": "Number of download threads",
+ "paramRequired": true
+ }
+]
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml
new file mode 100644
index 000000000..b5c807378
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml
@@ -0,0 +1,38 @@
+
+
+ jobTracker
+ ${jobTracker}
+
+
+ nameNode
+ ${nameNode}
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hiveMetastoreUris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ hiveJdbcUrl
+ jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1
+
+
+ impalaJdbcUrl
+ jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl;
+
+
+ oozie.wf.workflow.notification.url
+ {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status
+
+
+ oozie.use.system.libpath
+ true
+
+
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml
new file mode 100644
index 000000000..a6600516d
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml
@@ -0,0 +1,90 @@
+
+
+
+ hiveMetastoreUris
+ Hive server metastore URIs
+
+
+ hiveJdbcUrl
+ Hive server jdbc url
+
+
+ impalaJdbcUrl
+ Impala server jdbc url
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+
+
+ hive.metastore.uris
+ ${hiveMetastoreUris}
+
+
+ mapreduce.job.queuename
+ ${queueName}
+
+
+ oozie.launcher.mapred.job.queue.name
+ ${oozieLauncherQueueName}
+
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+ eu.dnetlib.oa.graph.usagerawdata.export.ExecuteWorkflow
+ --matomoAuthToken${matomoAuthToken}
+ --matomoBaseURL${matomoBaseURL}
+ --repoLogPath${repoLogPath}
+ --portalLogPath${portalLogPath}
+ --portalMatomoID${portalMatomoID}
+ --irusUKBaseURL${irusUKBaseURL}
+ --irusUKReportPath${irusUKReportPath}
+ --sarcsReportPathArray${sarcsReportPathArray}
+ --sarcsReportPathNonArray${sarcsReportPathNonArray}
+ --lareferenciaLogPath${lareferenciaLogPath}
+ --lareferenciaBaseURL${lareferenciaBaseURL}
+ --lareferenciaAuthToken${lareferenciaAuthToken}
+ --dbHiveUrl${hiveJdbcUrl}
+ --dbImpalaUrl${impalaJdbcUrl}
+ --usageStatsDBSchema${usageStatsDBSchema}
+ --statsDBSchema${statsDBSchema}
+ --recreateDbAndTables${recreateDbAndTables}
+ --piwikEmptyDirs${piwikEmptyDirs}
+ --downloadPiwikLogs${downloadPiwikLogs}
+ --processPiwikLogs${processPiwikLogs}
+ --startingLogPeriod${startingLogPeriod}
+ --endingLogPeriod${endingLogPeriod}
+ --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload}
+ --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload}
+ --laReferenciaEmptyDirs${laReferenciaEmptyDirs}
+ --downloadLaReferenciaLogs${downloadLaReferenciaLogs}
+ --processLaReferenciaLogs${processLaReferenciaLogs}
+ --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs}
+ --irusDownloadReports${irusDownloadReports}
+ --irusProcessStats${irusProcessStats}
+ --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload}
+ --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs}
+ --sarcDownloadReports${sarcDownloadReports}
+ --sarcProcessStats${sarcProcessStats}
+ --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload}
+ --finalizeStats${finalizeStats}
+ --finalTablesVisibleToImpala${finalTablesVisibleToImpala}
+ --numberOfDownloadThreads${numberOfDownloadThreads}
+
+
+
+
+
+
+
+
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java
index 6947381c9..011c90532 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java
@@ -174,7 +174,7 @@ public class IrusStats {
+ "WHERE `ItemIdent`.`Type`= 'OAI'";
stmt.executeUpdate(insertSushilogtmp);
logger.info("Inserted to irus_sushilogtmp table");
-
+
logger.info("Creating downloads_stats table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats "
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java
index 347d3de21..747b5ce0e 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java
@@ -357,6 +357,19 @@ public class LaReferenciaStats {
"select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
stmt.executeUpdate(sql);
+ logger.info("Inserting data to usage_stats from lareferencia");
+ sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
+ "SELECT coalesce(ds.source, vs.source) as source, " +
+ "coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
+ "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
+ "coalesce(ds.openaire, 0) as openaire_downloads, " +
+ "coalesce(vs.openaire, 0) as openaire_views " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp AS ds FULL OUTER JOIN " +
+ ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp AS vs ON ds.source=vs.source " +
+ "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
+ stmt.executeUpdate(sql);
+ logger.info("Inserted data to usage_stats from lareferencia");
// sql = "insert into public.downloads_stats select * from la_downloads_stats_tmp;";
// stmt.executeUpdate(sql);
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java
index 7a64e48d2..8f7fffa9f 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java
@@ -1,4 +1,3 @@
-
package eu.dnetlib.oa.graph.usagestats.export;
import java.io.*;
@@ -31,293 +30,296 @@ import org.slf4j.LoggerFactory;
*/
public class PiwikDownloadLogs {
- private final String piwikUrl;
- private Date startDate;
- private final String tokenAuth;
+ private final String piwikUrl;
+ private Date startDate;
+ private final String tokenAuth;
- /*
+ /*
* The Piwik's API method
- */
- private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
- private final String format = "&format=json";
+ */
+ private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+ private final String format = "&format=json";
- private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
+ private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
- public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
- this.piwikUrl = piwikUrl;
- this.tokenAuth = tokenAuth;
+ public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
+ this.piwikUrl = piwikUrl;
+ this.tokenAuth = tokenAuth;
- }
+ }
- private String getPiwikLogUrl() {
- return "https://" + piwikUrl + "/";
- }
+ private String getPiwikLogUrl() {
+ return "https://" + piwikUrl + "/";
+ }
- private String getJson(String url) throws Exception {
- try {
- logger.debug("Connecting to download the JSON: " + url);
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
+ private String getJson(String url) throws Exception {
+ try {
+ logger.debug("Connecting to download the JSON: " + url);
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
- }
- }
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL: " + url + " Exception: " + e);
- throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
- }
- }
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+ }
+ }
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL: " + url + " Exception: " + e);
+ throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
+ }
+ }
- class WorkerThread implements Runnable {
- private Calendar currDay;
- private int siteId;
- private String repoLogsPath;
- private String portalLogPath;
- private String portalMatomoID;
+ class WorkerThread implements Runnable {
- public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws IOException {
- this.currDay = (Calendar) currDay.clone();
- this.siteId = new Integer(siteId);
- this.repoLogsPath = new String(repoLogsPath);
- this.portalLogPath = new String(portalLogPath);
- this.portalMatomoID = new String(portalMatomoID);
- }
+ private Calendar currDay;
+ private int siteId;
+ private String repoLogsPath;
+ private String portalLogPath;
+ private String portalMatomoID;
- public void run() {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- System.out
- .println(
- Thread.currentThread().getName() + " (Start) Thread for "
- + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId +
- ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath +
- ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
- try {
- GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws IOException {
+ this.currDay = (Calendar) currDay.clone();
+ this.siteId = new Integer(siteId);
+ this.repoLogsPath = new String(repoLogsPath);
+ this.portalLogPath = new String(portalLogPath);
+ this.portalMatomoID = new String(portalMatomoID);
+ }
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- System.out
- .println(
- Thread.currentThread().getName() + " (End) Thread for "
- + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId +
- ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath +
- ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
- }
+ public void run() {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Start) Thread for "
+ + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+ + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+ + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
+ try {
+ GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
- public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (End) Thread for "
+ + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+ + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+ + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
+ }
- Date date = currDay.getTime();
- logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
+ public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- String period = "&period=day&date=" + sdf.format(date);
- String outFolder = "";
- if (siteId == Integer.parseInt(portalMatomoID)) {
- outFolder = portalLogPath;
- } else {
- outFolder = repoLogsPath;
- }
+ Date date = currDay.getTime();
+ logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
- String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
- + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
- String content = "";
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = "";
+ if (siteId == Integer.parseInt(portalMatomoID)) {
+ outFolder = portalLogPath;
+ } else {
+ outFolder = repoLogsPath;
+ }
- int i = 0;
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
- JSONParser parser = new JSONParser();
- StringBuffer totalContent = new StringBuffer();
- FileSystem fs = FileSystem.get(new Configuration());
+ int i = 0;
- do {
- int writtenBytes = 0;
- String apiUrl = baseApiUrl;
+ JSONParser parser = new JSONParser();
+ StringBuffer totalContent = new StringBuffer();
+ FileSystem fs = FileSystem.get(new Configuration());
- if (i > 0) {
- apiUrl += "&filter_offset=" + (i * 1000);
- }
+ do {
+ int writtenBytes = 0;
+ String apiUrl = baseApiUrl;
- content = getJson(apiUrl);
- if (content.length() == 0 || content.equals("[]"))
- break;
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
- FSDataOutputStream fin = fs
- .create(
- new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json"),
- true);
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
- byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
- fin.write(jsonObjectRawBytes);
- fin.writeChar('\n');
+ content = getJson(apiUrl);
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
- writtenBytes += jsonObjectRawBytes.length + 1;
- }
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json"),
+ true);
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
- fin.close();
- System.out
- .println(
- Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
- + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json");
+ writtenBytes += jsonObjectRawBytes.length + 1;
+ }
- i++;
- } while (true);
+ fin.close();
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json");
- fs.close();
- }
- }
+ i++;
+ } while (true);
- public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
+ fs.close();
+ }
+ }
- Statement statement = ConnectDB.getHiveConnection().createStatement();
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
- ResultSet rs = statement
- .executeQuery(
- "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
- + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
+ Statement statement = ConnectDB.getHiveConnection().createStatement();
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- // Getting all the piwikids in a list for logging reasons & limitting the list
- // to the max number of piwikids
- List piwikIdToVisit = new ArrayList();
+ ResultSet rs = statement
+ .executeQuery(
+ "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
+ + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
+
+ // Getting all the piwikids in a list for logging reasons & limitting the list
+ // to the max number of piwikids
+ List piwikIdToVisit = new ArrayList();
while (rs.next())
piwikIdToVisit.add(rs.getInt(1));
- logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
+ logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
- if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
- ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
- logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
- piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
- }
+ if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+ && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
+ logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ }
- logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
+ logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
+ // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
+ for (int siteId : piwikIdToVisit) {
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("Starting period for log download: " + sdf.format(start.getTime()));
- // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
- for (int siteId : piwikIdToVisit) {
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+ // Setting the ending period (last day of the month)
+ Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+ end.add(Calendar.MONTH, +1);
+ end.add(Calendar.DAY_OF_MONTH, -1);
+ logger.info("Ending period for log download: " + sdf.format(end.getTime()));
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+ logger.info("Now working on piwikId: " + siteId);
- logger.info("Now working on piwikId: " + siteId);
+ PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
+ .prepareStatement(
+ "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklog WHERE source=?");
+ st.setInt(1, siteId);
+ Date dateMax = null;
+ ResultSet rs_date = st.executeQuery();
+ while (rs_date.next()) {
+ logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
- PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
- .prepareStatement(
- "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
- + ".piwiklog WHERE source=?");
- st.setInt(1, siteId);
- Date dateMax=null;
- ResultSet rs_date = st.executeQuery();
- while (rs_date.next()) {
- logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
+ for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+ // logger.info("Date used " + currDay.toString());
+ // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ // executor.execute(worker);// calling execute method of ExecutorService
+ logger.info("Date used " + currDay.getTime().toString());
- for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
- // logger.info("Date used " + currDay.toString());
- // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
- // executor.execute(worker);// calling execute method of ExecutorService
- logger.info("Date used " + currDay.getTime().toString());
+ if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId);
+ } else {
+ GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ }
- if(dateMax!=null && currDay.getTime().compareTo(dateMax)<=0)
- logger.info("Date found in logs "+dateMax+ " and not downloanding Matomo logs for "+siteId);
- else
- GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ }
+ }
+ // executor.shutdown();
+ // while (!executor.isTerminated()) {
+ // }
+ // System.out.println("Finished all threads");
+ }
- }
- }
- // executor.shutdown();
- // while (!executor.isTerminated()) {
- // }
- // System.out.println("Finished all threads");
- }
+ public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ Date date = currDay.getTime();
+ logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
- Date date = currDay.getTime();
- logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = "";
+ if (siteId == Integer.parseInt(portalMatomoID)) {
+ outFolder = portalLogPath;
+ } else {
+ outFolder = repoLogsPath;
+ }
- String period = "&period=day&date=" + sdf.format(date);
- String outFolder = "";
- if (siteId == Integer.parseInt(portalMatomoID)) {
- outFolder = portalLogPath;
- } else {
- outFolder = repoLogsPath;
- }
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
- String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
- + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
- String content = "";
+ int i = 0;
- int i = 0;
+ JSONParser parser = new JSONParser();
+ StringBuffer totalContent = new StringBuffer();
+ FileSystem fs = FileSystem.get(new Configuration());
- JSONParser parser = new JSONParser();
- StringBuffer totalContent = new StringBuffer();
- FileSystem fs = FileSystem.get(new Configuration());
+ do {
+ int writtenBytes = 0;
+ String apiUrl = baseApiUrl;
- do {
- int writtenBytes = 0;
- String apiUrl = baseApiUrl;
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
- if (i > 0) {
- apiUrl += "&filter_offset=" + (i * 1000);
- }
+ content = getJson(apiUrl);
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
- content = getJson(apiUrl);
- if (content.length() == 0 || content.equals("[]"))
- break;
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json"),
+ true);
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
- FSDataOutputStream fin = fs
- .create(
- new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json"),
- true);
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
- byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
- fin.write(jsonObjectRawBytes);
- fin.writeChar('\n');
+ writtenBytes += jsonObjectRawBytes.length + 1;
+ }
- writtenBytes += jsonObjectRawBytes.length + 1;
- }
+ fin.close();
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json");
- fin.close();
- System.out
- .println(
- Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
- + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json");
+ i++;
+ } while (true);
- i++;
- } while (true);
-
- fs.close();
- }
+ fs.close();
+ }
}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
index 6625c381b..6d5bdfac0 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
@@ -551,17 +551,20 @@ public class PiwikStatsDB {
stmt.executeUpdate(createViewsStats);
logger.info("Created views_stats table");
- String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
- "AS SELECT coalesce(ds.source, vs.source) as source, " +
+ logger.info("Inserting data to usage_stats");
+ sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
+ "SELECT coalesce(ds.source, vs.source) as source, " +
"coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
"coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
"coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
"coalesce(ds.openaire, 0) as openaire_downloads, " +
"coalesce(vs.openaire, 0) as openaire_views " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " +
- ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp AS ds FULL OUTER JOIN " +
+ ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp AS vs ON ds.source=vs.source " +
"AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
- stmt.executeUpdate(createUsageStats);
+ stmt.executeUpdate(sql);
+ logger.info("Inserted data to usage_stats");
+
stmt.close();
ConnectDB.getHiveConnection().close();
@@ -1167,7 +1170,22 @@ public class PiwikStatsDB {
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp";
stmt.executeUpdate(sql);
- /*
+ logger.info("Creating usage_stats table");
+ String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
+ "AS SELECT coalesce(ds.source, vs.source) as source, " +
+ "coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
+ "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
+ "coalesce(ds.openaire, 0) as openaire_downloads, " +
+ "coalesce(vs.openaire, 0) as openaire_views " +
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " +
+ ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " +
+ "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
+ stmt.executeUpdate(createUsageStats);
+ logger.info("Created usage_stats table");
+
+
+ /*
* logger.info("Dropping table views_stats_tmp"); sql = "DROP TABLE IF EXISTS " +
* ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(sql);
* logger.info("Dropping table downloads_stats_tmp"); sql = "DROP TABLE IF EXISTS " +
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java
index 71dfc6f61..06e350c9e 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java
@@ -286,9 +286,9 @@ public class SarcStats {
ConnectDB.getHiveConnection().setAutoCommit(false);
stmtImpala = ConnectDB.getImpalaConnection().createStatement();
- logger.info("Creating downloads_stats table");
+ logger.info("Creating downloads_stats_tmp table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats "
+ + ".downloads_stats_tmp "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`result_id` string, "
@@ -296,7 +296,7 @@ public class SarcStats {
+ "`count` bigint, "
+ "`openaire` bigint)";
stmtHive.executeUpdate(createDownloadsStats);
- logger.info("Created downloads_stats table");
+ logger.info("Created downloads_stats_tmp table");
logger.info("Dropping sarc_sushilogtmp_impala table");
String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS "
@@ -341,20 +341,19 @@ public class SarcStats {
+ "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, "
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
- + ConnectDB.getStatsDBSchema() + ".datasource_results dr, "
+ ConnectDB.getStatsDBSchema() + ".result_pids ro "
- + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND dr.id=d.id AND dr.result=ro.id AND "
- + "s.rid=ro.pid AND ro.type='Digital Object Identifier' AND metric_type='ft_total' AND s.source='SARC-OJS'";
+ + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
+ + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
stmtImpala.executeUpdate(createDownloadsStatsImpala);
logger.info("Creating downloads_stats_impala");
// Insert into downloads_stats
- logger.info("Inserting data from downloads_stats_impala into downloads_stats");
+ logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp");
String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats SELECT * "
+ + ".downloads_stats_tmp SELECT * "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala";
stmtHive.executeUpdate(insertDStats);
- logger.info("Inserted into downloads_stats");
+ logger.info("Inserted into downloads_stats_tmp");
logger.info("Creating sushilog table");
String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
index ae901dfa5..405b58bd5 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
@@ -173,7 +173,7 @@ public class UsageStatsExporter {
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
- stmt.close();
+ stmt.close();
ConnectDB.getHiveConnection().close();
}
}