diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml
deleted file mode 100644
index a65c4514a..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-
-
-
- JDK_1.8
-
-
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
deleted file mode 100644
index 5593d4d87..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
+++ /dev/null
@@ -1,112 +0,0 @@
-
-
-
-
-
-
-
-
- dhp-workflows
- eu.dnetlib.dhp
- 1.1.7-SNAPSHOT
-
- 4.0.0
- dhp-usage-datasets-stats-update
-
-
-
- pl.project13.maven
- git-commit-id-plugin
- 2.1.15
-
-
-
- revision
-
-
-
-
- ${project.basedir}/../.git
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.6.1
-
-
- 1.8
-
-
-
-
-
- UTF-8
- UTF-8
- 0.13.1-cdh5.2.1
- 2.5.0-cdh5.2.1
-
-
-
-
- org.apache.spark
- spark-core_2.11
- 2.2.0
-
-
- org.apache.spark
- spark-sql_2.11
- 2.4.5
-
-
- com.googlecode.json-simple
- json-simple
- 1.1.1
-
-
- org.json
- json
- 20180130
- jar
-
-
- org.apache.hive
- hive-jdbc
- ${cdh.hive.version}
-
-
- org.apache.hadoop
- hadoop-common
- ${cdh.hadoop.version}
-
-
- eu.dnetlib.dhp
- dhp-common
- ${project.version}
-
-
- com.mchange
- c3p0
- 0.9.5.2
-
-
- c3p0
- c3p0
- 0.9.1.2
- jar
-
-
- dhp-usage-datasets-stats-update
-
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh
deleted file mode 100755
index 9b4325508..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh
+++ /dev/null
@@ -1 +0,0 @@
-mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats
\ No newline at end of file
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
deleted file mode 100644
index 25b30e8ad..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-
-package eu.dnetlib.oa.graph.datasetsusagestats.export;
-
-import java.sql.Connection;
-import java.sql.DriverManager;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.Properties;
-
-import org.apache.log4j.Logger;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-import com.mchange.v2.c3p0.ComboPooledDataSource;
-
-public abstract class ConnectDB {
-
- public static Connection DB_HIVE_CONNECTION;
- public static Connection DB_IMPALA_CONNECTION;
-
- private static String dbHiveUrl;
- private static String dbImpalaUrl;
- private static String datasetUsageStatsDBSchema;
- private static String statsDBSchema;
- private final static Logger logger = Logger.getLogger(ConnectDB.class);
- private Statement stmt = null;
-
- static void init() throws ClassNotFoundException {
-
- dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
- dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
- datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema;
- statsDBSchema = ExecuteWorkflow.statsDBSchema;
-
- Class.forName("org.apache.hive.jdbc.HiveDriver");
- }
-
- public static Connection getHiveConnection() throws SQLException {
- if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) {
- return DB_HIVE_CONNECTION;
- } else {
- DB_HIVE_CONNECTION = connectHive();
-
- return DB_HIVE_CONNECTION;
- }
- }
-
- public static Connection getImpalaConnection() throws SQLException {
- if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) {
- return DB_IMPALA_CONNECTION;
- } else {
- DB_IMPALA_CONNECTION = connectImpala();
-
- return DB_IMPALA_CONNECTION;
- }
- }
-
- public static String getDataSetUsageStatsDBSchema() {
- return ConnectDB.datasetUsageStatsDBSchema;
- }
-
- public static String getStatsDBSchema() {
- return ConnectDB.statsDBSchema;
- }
-
- private static Connection connectHive() throws SQLException {
- /*
- * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt =
- * connection.createStatement(); log.debug("Opened database successfully"); return connection;
- */
- ComboPooledDataSource cpds = new ComboPooledDataSource();
- cpds.setJdbcUrl(dbHiveUrl);
- cpds.setUser("dimitris.pierrakos");
- cpds.setAcquireIncrement(1);
- cpds.setMaxPoolSize(100);
- cpds.setMinPoolSize(1);
- cpds.setInitialPoolSize(1);
- cpds.setMaxIdleTime(300);
- cpds.setMaxConnectionAge(36000);
-
- cpds.setAcquireRetryAttempts(5);
- cpds.setAcquireRetryDelay(2000);
- cpds.setBreakAfterAcquireFailure(false);
-
- cpds.setCheckoutTimeout(0);
- cpds.setPreferredTestQuery("SELECT 1");
- cpds.setIdleConnectionTestPeriod(60);
-
- logger.info("Opened database successfully");
-
- return cpds.getConnection();
-
- }
-
- private static Connection connectImpala() throws SQLException {
- /*
- * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt =
- * connection.createStatement(); log.debug("Opened database successfully"); return connection;
- */
- ComboPooledDataSource cpds = new ComboPooledDataSource();
- cpds.setJdbcUrl(dbImpalaUrl);
- cpds.setUser("dimitris.pierrakos");
- cpds.setAcquireIncrement(1);
- cpds.setMaxPoolSize(100);
- cpds.setMinPoolSize(1);
- cpds.setInitialPoolSize(1);
- cpds.setMaxIdleTime(300);
- cpds.setMaxConnectionAge(36000);
-
- cpds.setAcquireRetryAttempts(5);
- cpds.setAcquireRetryDelay(2000);
- cpds.setBreakAfterAcquireFailure(false);
-
- cpds.setCheckoutTimeout(0);
- cpds.setPreferredTestQuery("SELECT 1");
- cpds.setIdleConnectionTestPeriod(60);
-
- logger.info("Opened database successfully");
- return cpds.getConnection();
-
- }
-}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java
deleted file mode 100644
index 88db1f819..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java
+++ /dev/null
@@ -1,168 +0,0 @@
-
-package eu.dnetlib.oa.graph.datasetsusagestats.export;
-
-import java.sql.Connection;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class DatasetsStatsDB {
-
- private String logPath;
- private String logRepoPath;
- private String logPortalPath;
-
- private Statement stmt = null;
-
- private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class);
-
- private String CounterRobotsURL;
- private ArrayList robotsList;
-
- public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception {
- this.logRepoPath = logRepoPath;
- this.logPortalPath = logPortalPath;
-
- }
-
- public void recreateDBAndTables() throws Exception {
- this.createDatabase();
- this.createTables();
- }
-
-// public void reCreateLogDirs() throws IllegalArgumentException, IOException {
-// FileSystem dfs = FileSystem.get(new Configuration());
-//
-// logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
-// dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
-//
-// logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
-// dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
-//
-// logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
-// dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
-//
-// logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
-// dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
-// }
- public ArrayList getRobotsList() {
- return robotsList;
- }
-
- public void setRobotsList(ArrayList robotsList) {
- this.robotsList = robotsList;
- }
-
- public String getCounterRobotsURL() {
- return CounterRobotsURL;
- }
-
- public void setCounterRobotsURL(String CounterRobotsURL) {
- this.CounterRobotsURL = CounterRobotsURL;
- }
-
- private void createDatabase() throws Exception {
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
- String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
- stmt.executeUpdate(dropDatabase);
- } catch (Exception e) {
- logger.error("Failed to drop database: " + e);
- throw new Exception("Failed to drop database: " + e.toString(), e);
- }
-
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
- String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
- stmt.executeUpdate(createDatabase);
-
- } catch (Exception e) {
- logger.error("Failed to create database: " + e);
- throw new Exception("Failed to create database: " + e.toString(), e);
- }
- }
-
- private void createTables() throws Exception {
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- // Create Reports table - This table should exist
- logger.info("Creating Reports Table");
- String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".datacitereports(reportid STRING, \n"
- + " name STRING, \n"
- + " source STRING,\n"
- + " release STRING,\n"
- + " createdby STRING,\n"
- + " report_start_date STRING,\n"
- + " report_end_date STRING)\n"
- + " CLUSTERED BY (reportid)\n"
- + " into 100 buckets stored as orc tblproperties('transactional'='true')";
-
- stmt.executeUpdate(sqlCreateTableDataciteReports);
- logger.info("Reports Table Created");
-
- // Create Datasets Table
- logger.info("Creating DataSets Table");
- String sqlCreateTableDataSets = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".datasets(ds_type STRING,\n"
- + " ds_title STRING,\n"
- + " yop STRING,\n"
- + " uri STRING,\n"
- + " platform STRING,\n"
- + " data_type STRING,\n"
- + " publisher STRING,\n"
- + " publisher_id_type STRING,\n"
- + " publisher_id_value STRING,\n"
- + " ds_dates_type STRING,\n"
- + " ds_pub_date STRING,\n"
- + " ds_contributors STRING,\n"
- // + " ds_contributor_value array ,\n"
- + " reportid STRING)\n"
- + " CLUSTERED BY (ds_type)\n"
- + " into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTableDataSets);
- logger.info("DataSets Table Created");
-
- // Create Datasets Performance Table
- logger.info("Creating DataSetsPerformance Table");
- String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".datasetsperformance(ds_type STRING,\n"
- + " period_end STRING,\n"
- + " period_from STRING,\n"
- + " access_method STRING,\n"
- + " metric_type STRING,\n"
- + " count INT,\n"
- + " country_counts STRING,\n"
- + " reportid STRING)\n"
- + " CLUSTERED BY (ds_type)\n"
- + " into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTableDataSetsPerformance);
- logger.info("DataSetsPerformance Table Created");
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
-
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
-
- private Connection getConnection() throws SQLException {
- return ConnectDB.getHiveConnection();
- }
-}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java
deleted file mode 100644
index a73b299ec..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-
-package eu.dnetlib.oa.graph.datasetsusagestats.export;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Iterator;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.json.simple.parser.ParseException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.gson.Gson;
-import com.google.gson.JsonArray;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonObject;
-
-/**
- * @author dpie
- */
-public class DownloadReportsListFromDatacite {
-
- private String dataciteBaseURL;
- private String dataciteReportPath;
- private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
-
- public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath)
- throws MalformedURLException, Exception {
-
- this.dataciteBaseURL = dataciteBaseURL;
- this.dataciteReportPath = dataciteReportPath;
- }
-
- public void downloadReportsList() throws ParseException {
- StringBuilder responseStrBuilder = new StringBuilder();
-
- Gson gson = new Gson();
-
- try {
- BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream());
- logger.info("Downloading from " + dataciteBaseURL);
-
- BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
- String inputStr;
-
- while ((inputStr = streamReader.readLine()) != null) {
- responseStrBuilder.append(inputStr);
- }
- } catch (IOException e) {
- logger.info(e.getMessage());
- }
- JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class);
- JsonArray dataArray = jsonObject.getAsJsonArray("reports");
- ArrayList reportsList = new ArrayList();
- for (JsonElement element : dataArray) {
- reportsList.add(element.getAsJsonObject().get("id").getAsString());
- }
-
- Iterator it = reportsList.iterator();
- while (it.hasNext()) {
- String reportId = it.next().toString();
- String url = dataciteBaseURL + reportId;
-
- try {
- BufferedInputStream in = new BufferedInputStream(new URL(url).openStream());
- BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
- String inputStr;
- StringBuilder responseStrBuilder2 = new StringBuilder();
- while ((inputStr = streamReader.readLine()) != null) {
- responseStrBuilder2.append(inputStr);
- }
- FileSystem fs = FileSystem.get(new Configuration());
- FSDataOutputStream fin = fs
- .create(
- new Path(dataciteReportPath + "/" + reportId + ".json"),
- true);
- byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes();
- fin.write(jsonObjectRawBytes);
- fin.writeChar('\n');
-
- fin.close();
-
- fin.close();
- } catch (IOException e) {
- System.out.println(e);
- }
- }
- }
-}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
deleted file mode 100644
index b28578e4b..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-
-package eu.dnetlib.oa.graph.datasetsusagestats.export;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.log4j.BasicConfigurator;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class ExecuteWorkflow {
-
- static String dataciteBaseURL;
- static String dataciteReportPath;
- static String dbHiveUrl;
- static String dbImpalaUrl;
- static String datasetUsageStatsDBSchema;
- static String statsDBSchema;
- static boolean recreateDbAndTables;
- static boolean datasetsEmptyDirs;
- static boolean finalTablesVisibleToImpala;
-
- public static void main(String args[]) throws Exception {
-
- // Sending the logs to the console
- BasicConfigurator.configure();
-
- final ArgumentApplicationParser parser = new ArgumentApplicationParser(
- IOUtils
- .toString(
- UsageStatsExporter.class
- .getResourceAsStream(
- "/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json")));
- parser.parseArgument(args);
-
- // Setting up the initial parameters
- dataciteBaseURL = parser.get("dataciteBaseURL");
- dataciteReportPath = parser.get("dataciteReportPath");
- dbHiveUrl = parser.get("dbHiveUrl");
- dbImpalaUrl = parser.get("dbImpalaUrl");
- datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema");
- statsDBSchema = parser.get("statsDBSchema");
-
- if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
- recreateDbAndTables = true;
- else
- recreateDbAndTables = false;
-
- if (parser.get("datasetsEmptyDirs").toLowerCase().equals("true"))
- datasetsEmptyDirs = true;
- else
- datasetsEmptyDirs = false;
-
-// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
-// finalTablesVisibleToImpala = true;
-// else
-// finalTablesVisibleToImpala = false;
-//
- UsageStatsExporter usagestatsExport = new UsageStatsExporter();
- usagestatsExport.export();
- }
-
-}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
deleted file mode 100644
index ccb3eebd3..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-
-package eu.dnetlib.oa.graph.datasetsusagestats.export;
-
-import java.io.*;
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.sql.Array;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.ArrayList;
-import java.util.Base64;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.zip.GZIPInputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.gson.Gson;
-import com.google.gson.JsonArray;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonObject;
-
-/**
- * @author dpie
- */
-public class ReadReportsListFromDatacite {
-
- private String dataciteReportPath;
- private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
-
- public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception {
-
- this.dataciteReportPath = dataciteReportPath;
- }
-
- public void readReports() throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
- File folder = new File(dataciteReportPath);
- ArrayList jsonFiles = listHdfsDir(dataciteReportPath);
- for (String jsonFile : jsonFiles) {
- logger.info("Reading report file " + jsonFile);
- this.createTmpReportsTable(jsonFile);
-
- String sqlSelectReportID = "SELECT get_json_object(json, '$.report.id') FROM "
- + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- stmt.execute(sqlSelectReportID);
- ResultSet rstmpReportID = stmt.getResultSet();
-
- String reportID = null;
- while (rstmpReportID.next()) {
- reportID = rstmpReportID.getString(1);
- }
-
- logger.info("Checking report with id " + reportID);
- String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".datacitereports where reportid=?";
- PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists);
- stGetReportID.setString(1, reportID);
-
- ResultSet rsCheckIfReportExist = stGetReportID.executeQuery();
-
- if (rsCheckIfReportExist.next()) {
- logger.info("Report found with ID " + reportID);
- dropTmpReportsTable();
- } else {
- String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
- + " .datacitereports "
- + "SELECT\n"
- + " get_json_object(json, '$.report.id') AS reportid,\n"
- + " get_json_object(json, '$.report.report-header.report-name') AS name,\n"
- + " get_json_object(json, '$.report.report-header.report-id') AS source,\n"
- + " get_json_object(json, '$.report.report-header.release') AS release,\n"
- + " get_json_object(json, '$.report.report-header.created-by\') AS createdby,\n"
- + " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n"
- + " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n"
- + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- stmt.execute(sqlInsertReport);
-
- logger.info("Report added");
-
- logger.info("Adding datasets");
- String sqlSelecteDatasetsArray = "SELECT get_json_object(json, '$.report.report-datasets') FROM "
- + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- stmt.execute(sqlSelecteDatasetsArray);
- ResultSet rstmpReportDatasets = stmt.getResultSet();
-
- if (rstmpReportDatasets.next() && rstmpReportDatasets.getString(1).indexOf(',') > 0) {
- String[] listDatasets = rstmpReportDatasets.getString(1).split(",");
- logger.info("Datasets found " + listDatasets.length);
-
- for (int i = 0; i < listDatasets.length; i++) {
-
- String sqlInsertDataSets = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
- + " .datasets "
- + "SELECT\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].dataset-id[0].value') AS ds_type,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].dataset-title') AS ds_title,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i + "].yop') AS yop,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i + "].uri') AS uri,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i + "].platform') AS platform,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i + "].data-type') AS data_type,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i + "].publisher') AS publisher,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].publisher-id.type[0]') AS publisher_id_type,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].publisher-id.value[0]') AS publisher_id_value,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].dataset-dates.type[0]') AS ds_dates_type,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].dataset-dates.value[0]') AS ds_dates_value,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].dataset-contributors') AS ds_contributors,\n"
- + " get_json_object(json, '$.report.id') AS reportid \n"
- + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- stmt.execute(sqlInsertDataSets);
-
- logger.info("Dataset added " + i);
-
- logger.info("Adding Dataset Performance");
- String sqlSelecteDatasetsPerformance = "SELECT get_json_object(json, '$.report.report-datasets["
- + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- stmt.execute(sqlSelecteDatasetsPerformance);
- ResultSet rstmpReportDatasetsPerformance = stmt.getResultSet();
- if (rstmpReportDatasetsPerformance.next()
- && rstmpReportDatasetsPerformance.getString(1).indexOf(',') > 0) {
- String[] listDatasetsPerformance = rstmpReportDatasetsPerformance.getString(1).split(",");
- logger.info("Datasets Performance found " + listDatasetsPerformance.length);
- for (int j = 0; j < listDatasetsPerformance.length; j++) {
- String sqlSelecteDatasetsPerformanceInstance = "SELECT get_json_object(json, '$.report.report-datasets["
- + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".tmpjson";
- stmt.execute(sqlSelecteDatasetsPerformanceInstance);
- ResultSet rstmpReportDatasetsPerformanceInstance = stmt.getResultSet();
- if (rstmpReportDatasetsPerformanceInstance.next()
- && rstmpReportDatasetsPerformanceInstance.getString(1).indexOf(',') > 0) {
- String[] listDatasetsPerformanceInstance = rstmpReportDatasetsPerformanceInstance
- .getString(1)
- .split(",");
- logger.info("Datasets Performance found " + listDatasetsPerformanceInstance.length);
- for (int k = 0; k < listDatasetsPerformanceInstance.length; k++) {
- String sqlInsertDataSetsPerformance = "INSERT INTO "
- + ConnectDB.getDataSetUsageStatsDBSchema() + " .datasetsperformance "
- + "SELECT\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].dataset-id[0].value') AS ds_type,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].performance[" + j + "].period.end-date') AS period_end,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].performance[" + j + "].period.begin-date') AS period_from,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].performance[" + j + "].instance[" + k
- + "].access-method') AS access_method,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].performance[" + j + "].instance[" + k
- + "].metric-type') AS metric_type,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].performance[" + j + "].instance[" + k + "].count') AS count,\n"
- + " get_json_object(json, '$.report.report-datasets[" + i
- + "].performance[" + j + "].instance[" + k
- + "].country-counts') AS country_counts,\n"
- + " get_json_object(json, '$.report.id') AS reportid \n"
- + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- stmt.execute(sqlInsertDataSetsPerformance);
- }
- }
- }
- }
- logger.info("DatasetPerformance added for dataset" + i);
- }
- }
- logger.info("Adding gzip performance");
- String sqlSelecteReportSubsets = "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM "
- + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- stmt.execute(sqlSelecteReportSubsets);
- ResultSet rstmpReportSubsets = stmt.getResultSet();
- if (rstmpReportSubsets.next()) {
- String unCompressedReport = uncompressString(rstmpReportSubsets.getString(1));
- this.readCompressedReport(unCompressedReport, reportID);
- }
- }
- }
- this.dropTmpReportsTable();
- }
-
- public void readCompressedReport(String report, String reportId) throws Exception {
- Gson gson = new Gson();
- JsonObject jsonObject = gson.fromJson(report, JsonObject.class);
-
- JsonArray jsonReportDatasets;
- if (jsonObject.getAsJsonArray("report_datasets") != null) {
- jsonReportDatasets = jsonObject.getAsJsonArray("report_datasets");
- } else {
- jsonReportDatasets = jsonObject.getAsJsonArray("report-datasets");
- }
-
- for (JsonElement datasetElement : jsonReportDatasets) {
- // JsonElement dataset_title = datasetElement.getAsJsonObject().get("dataset-title");
- String dataset_title = datasetElement.getAsJsonObject().get("dataset-title").getAsString();
- String yop = datasetElement.getAsJsonObject().get("yop").getAsString();
- String uri = datasetElement.getAsJsonObject().get("uri").getAsString();
- String platform = datasetElement.getAsJsonObject().get("platform").getAsString();
- String data_type = datasetElement.getAsJsonObject().get("data-type").getAsString();
- String publisher = datasetElement.getAsJsonObject().get("publisher").getAsString();
-
- JsonArray publisher_id = datasetElement.getAsJsonObject().getAsJsonArray("publisher-id");
- String publisher_id_type = "";
- String publisher_id_value = "";
- for (JsonElement publisher_id_Element : publisher_id) {
- publisher_id_type = publisher_id_Element.getAsJsonObject().get("type").getAsString();
- publisher_id_value = publisher_id_Element.getAsJsonObject().get("value").getAsString();
- }
- JsonArray dataset_days = datasetElement.getAsJsonObject().getAsJsonArray("dataset-dates");
- String ds_dates_type = "";
- String ds_dates_value = "";
- for (JsonElement datasetDaysElement : dataset_days) {
- ds_dates_type = datasetDaysElement.getAsJsonObject().get("type").getAsString();
- ds_dates_value = datasetDaysElement.getAsJsonObject().get("value").getAsString();
- }
-
- JsonArray datasetContributors = null;
- String ds_contributor_type = "";
- String[] ds_contributor_values = null;
- Array ds_contributor_valuesArr = null;
-
- if (datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors") != null) {
- datasetContributors = datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors");
-
- JsonArray datasetid = datasetElement.getAsJsonObject().getAsJsonArray("dataset-id");
- String doi = "";
- for (JsonElement datasetIDElement : datasetid)
-//System.out.println(datasetIDElement.getAsJsonObject().get("value").getAsString());
- {
- doi = datasetIDElement.getAsJsonObject().get("value").getAsString();
- }
-
- String sqlInsertDataset = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
- + " .datasets(ds_type,"
- + "ds_title,yop,uri,platform,data_type,publisher,publisher_id_type,publisher_id_value,"
- + "ds_dates_type, ds_dates_value, ds_contributors,reportid) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?) ";
-
- PreparedStatement pstmtDataset = ConnectDB.DB_HIVE_CONNECTION.prepareStatement(sqlInsertDataset);
-
- pstmtDataset.setString(1, doi);
- pstmtDataset.setString(2, dataset_title);
- pstmtDataset.setString(3, yop);
- pstmtDataset.setString(4, uri);
- pstmtDataset.setString(5, platform);
- pstmtDataset.setString(6, data_type);
- pstmtDataset.setString(7, publisher);
- pstmtDataset.setString(8, publisher_id_type);
- pstmtDataset.setString(9, publisher_id_value);
- pstmtDataset.setString(10, ds_dates_type);
- pstmtDataset.setString(11, ds_dates_value);
- pstmtDataset.setString(13, datasetContributors.getAsString());
- pstmtDataset.setString(14, reportId);
-
- pstmtDataset.execute();
- logger.info("Dataset from compressed report addded " + doi);
- /*
- * JsonArray performance = datasetElement.getAsJsonObject().getAsJsonArray("performance"); for
- * (JsonElement performanceElement : performance) { JsonObject period =
- * performanceElement.getAsJsonObject().getAsJsonObject("period"); String end_date =
- * period.getAsJsonObject().get("end-date").getAsString(); String begin_date =
- * period.getAsJsonObject().get("begin-date").getAsString(); JsonArray instance =
- * performanceElement.getAsJsonObject().getAsJsonArray("instance"); for (JsonElement instanceElement :
- * instance) { int count = instanceElement.getAsJsonObject().get("count").getAsInt(); JsonObject
- * country_counts = instanceElement.getAsJsonObject().getAsJsonObject("country-counts"); Set
- * keys = country_counts.keySet(); String[] country = new String[country_counts.size()]; String[]
- * country_counts_val = new String[country_counts.size()]; Iterator it2 = keys.iterator(); int j = 0;
- * while (it2.hasNext()) { country[j] = it2.next().toString(); country_counts_val[j] =
- * country_counts.get(country[j]).getAsString(); } Array countryArr = conn.createArrayOf("text",
- * country); Array countrycountsArr = conn.createArrayOf("text", country_counts_val); String metrictype
- * = instanceElement.getAsJsonObject().get("metric-type").getAsString(); String accessMethod =
- * instanceElement.getAsJsonObject().get("access-method").getAsString(); String
- * sqlInsertDatasetPerformance =
- * "INSERT INTO datasetperformance(ds_type,period_end,period_from,access_method,metric_type,count,country,country_count, reportid) VALUES(?,?,?,?,?,?,?,?,?)"
- * ; PreparedStatement pstmtDatasetPerformance = conn.prepareStatement(sqlInsertDatasetPerformance);
- * //System.out.println(begin_date + " " + end_date + " " + doi + " " + metrictype + " " + count);
- * pstmtDatasetPerformance.setString(1, doi); pstmtDatasetPerformance.setString(2, end_date);
- * pstmtDatasetPerformance.setString(3, begin_date); pstmtDatasetPerformance.setString(4, accessMethod);
- * pstmtDatasetPerformance.setString(5, metrictype); pstmtDatasetPerformance.setInt(6, count);
- * pstmtDatasetPerformance.setArray(7, countryArr); pstmtDatasetPerformance.setArray(8,
- * countrycountsArr); pstmtDatasetPerformance.setString(9, reportId); pstmtDatasetPerformance.execute();
- * } }
- */
- }
- }
-
- }
-
- private ArrayList listHdfsDir(String dir) throws Exception {
-
- FileSystem hdfs = FileSystem.get(new Configuration());
- RemoteIterator Files;
- ArrayList fileNames = new ArrayList<>();
-
- try {
- Path exportPath = new Path(hdfs.getUri() + dir);
- Files = hdfs.listFiles(exportPath, false);
- while (Files.hasNext()) {
- String fileName = Files.next().getPath().toString();
- fileNames.add(fileName);
- }
-
- hdfs.close();
- } catch (Exception e) {
- logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir));
- throw new Exception("HDFS file path with exported data does not exist : " + dir, e);
- }
-
- return fileNames;
- }
-
- private String readHDFSFile(String filename) throws Exception {
- String result;
- try {
-
- FileSystem fs = FileSystem.get(new Configuration());
- // log.info("reading file : " + filename);
-
- BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));
-
- StringBuilder sb = new StringBuilder();
- String line = br.readLine();
-
- while (line != null) {
- sb.append(line);
- // sb.append(line);
- line = br.readLine();
- }
- // result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
- result = sb.toString().trim();
- // fs.close();
- } catch (Exception e) {
- throw new Exception(e);
- }
-
- return result;
- }
-
- public static String uncompressString(String zippedBase64Str)
- throws IOException {
- String result = null;
-
- // In my solr project, I use org.apache.solr.common.util.Base64.
- // byte[] bytes =
- // org.apache.solr.common.util.Base64.base64ToByteArray(zippedBase64Str);
- byte[] bytes = Base64.getDecoder().decode(zippedBase64Str);
- GZIPInputStream zi = null;
- try {
- zi = new GZIPInputStream(new ByteArrayInputStream(bytes));
- result = IOUtils.toString(zi);
- } finally {
- IOUtils.closeQuietly(zi);
- }
- return result;
- }
-
- private void createTmpReportsTable(String jsonFile) throws SQLException {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- dropTmpReportsTable();
- String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".tmpjson (json STRING)";
- stmt.executeUpdate(createTmpTable);
- logger.info("Tmp Table Created");
-
- String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE "
- + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- stmt.execute(insertJsonReport);
- logger.info("JSON Report File inserted to tmpjson Table");
- }
-
- private void dropTmpReportsTable() throws SQLException {
- logger.info("Dropping tmpjson Table");
- String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- stmt.executeUpdate(dropTmpTable);
- logger.info("Dropped tmpjson Table");
-
- }
-
-}
-
-/*
- * PreparedStatement prepStatem = conn.
- * prepareStatement("insert into usageStats (source, entityID,sourceItemType,entityType, counter,action,timestamp_month,referrer) values (?,?,?,?,?,?,?,?)"
- * );
- */
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
deleted file mode 100644
index 7b07fbc25..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
+++ /dev/null
@@ -1,111 +0,0 @@
-
-package eu.dnetlib.oa.graph.datasetsusagestats.export;
-
-import java.io.IOException;
-import java.sql.SQLException;
-import java.sql.Statement;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Main class for downloading and processing Usage statistics
- *
- * @author D. Pierrakos, S. Zoupanos
- */
-public class UsageStatsExporter {
-
- private Statement stmt = null;
-
- public UsageStatsExporter() {
-
- }
-
- private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
-
- private void reCreateLogDirs() throws IllegalArgumentException, IOException {
- FileSystem dfs = FileSystem.get(new Configuration());
-
- logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath);
- dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true);
-
- logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath));
-
- }
-
- public void export() throws Exception {
-
- logger.info("Initialising DB properties");
- ConnectDB.init();
- ConnectDB.getHiveConnection();
-
- if (ExecuteWorkflow.recreateDbAndTables) {
- DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", "");
- datasetsDB.recreateDBAndTables();
- }
- logger.info("Initializing the download logs module");
- DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL,
- ExecuteWorkflow.dataciteReportPath);
-
- if (ExecuteWorkflow.datasetsEmptyDirs) {
- logger.info("Downloading Reports List From Datacite");
- drfd.downloadReportsList();
- logger.info("Reports List has been downloaded");
- }
-
- ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite(
- ExecuteWorkflow.dataciteReportPath);
- logger.info("Store Reports To DB");
- readReportsListFromDatacite.readReports();
- logger.info("Reports Stored To DB");
- }
-
-// runImpalaQuery();
- /*
- * PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
- * logger.info("Re-creating database and tables"); logger.info("Initializing the download logs module");
- * PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
- * if (ExecuteWorkflow.piwikEmptyDirs) { logger.info("Recreating Piwik log directories");
- * piwikstatsdb.reCreateLogDirs(); } // Downloading piwik logs (also managing directory creation) if
- * (ExecuteWorkflow.downloadPiwikLogs) { logger.info("Downloading piwik logs"); piwd .GetOpenAIRELogs(
- * ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); }
- * logger.info("Downloaded piwik logs"); // Create DB tables, insert/update statistics String cRobotsUrl =
- * "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
- * piwikstatsdb.setCounterRobotsURL(cRobotsUrl); if (ExecuteWorkflow.processPiwikLogs) {
- * logger.info("Processing logs"); piwikstatsdb.processLogs(); } logger.info("Creating LaReferencia tables");
- * LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
- * ExecuteWorkflow.lareferenciaAuthToken); if (ExecuteWorkflow.laReferenciaEmptyDirs) {
- * logger.info("Recreating LaReferencia log directories"); lrf.reCreateLogDirs(); } if
- * (ExecuteWorkflow.downloadLaReferenciaLogs) { logger.info("Downloading LaReferencia logs");
- * lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); }
- * LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if
- * (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); lastats.processLogs();
- * logger.info("LaReferencia logs done"); } IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if
- * (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables");
- * irusstats.createTables(); logger.info("Created Irus Stats tables"); logger.info("Re-create log dirs");
- * irusstats.reCreateLogDirs(); logger.info("Re-created log dirs"); } if (ExecuteWorkflow.irusDownloadReports) {
- * irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } if (ExecuteWorkflow.irusProcessStats) {
- * irusstats.processIrusStats(); logger.info("Irus done"); } SarcStats sarcStats = new SarcStats(); if
- * (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); } if
- * (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray,
- * ExecuteWorkflow.sarcsReportPathNonArray); } if (ExecuteWorkflow.sarcProcessStats) {
- * sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
- * sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); // finalize usagestats if
- * (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); logger.info("Finalized stats"); } // Make the
- * tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) {
- * logger.info("Making tables visible to Impala"); invalidateMetadata(); } logger.info("End");
- */
-}
-/*
- * private void invalidateMetadata() throws SQLException { Statement stmt = null; stmt =
- * ConnectDB.getImpalaConnection().createStatement(); String sql = "INVALIDATE METADATA " +
- * ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA "
- * + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " +
- * ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " +
- * ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); stmt.close();
- * ConnectDB.getHiveConnection().close(); }
- */
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json
deleted file mode 100644
index f8d51a882..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json
+++ /dev/null
@@ -1,56 +0,0 @@
-[
- {
- "paramName": "dbu",
- "paramLongName": "dataciteBaseURL",
- "paramDescription": "URL of Datacite Reports Endpoint",
- "paramRequired": true
- },
- {
- "paramName": "drp",
- "paramLongName": "dataciteReportPath",
- "paramDescription": "Path for Datacite Reports",
- "paramRequired": true
- },
- {
- "paramName": "dbhu",
- "paramLongName": "dbHiveUrl",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "dbiu",
- "paramLongName": "dbImpalaUrl",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "dusdbs",
- "paramLongName": "datasetUsageStatsDBSchema",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "sdbs",
- "paramLongName": "statsDBSchema",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "rdbt",
- "paramLongName": "recreateDbAndTables",
- "paramDescription": "Re-create database and initial tables?",
- "paramRequired": true
- },
- {
- "paramName": "pwed",
- "paramLongName": "datasetsEmptyDirs",
- "paramDescription": "Empty piwik directories?",
- "paramRequired": true
- },
- {
- "paramName": "ftvi",
- "paramLongName": "finalTablesVisibleToImpala",
- "paramDescription": "Make the dataset_usage_stats, visible to Impala",
- "paramRequired": true
- }
-]
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml
deleted file mode 100644
index b5c807378..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-
-
- jobTracker
- ${jobTracker}
-
-
- nameNode
- ${nameNode}
-
-
- oozie.use.system.libpath
- true
-
-
- oozie.action.sharelib.for.spark
- spark2
-
-
- hiveMetastoreUris
- thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
-
-
- hiveJdbcUrl
- jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1
-
-
- impalaJdbcUrl
- jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl;
-
-
- oozie.wf.workflow.notification.url
- {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status
-
-
- oozie.use.system.libpath
- true
-
-
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml
deleted file mode 100644
index 3a81e497d..000000000
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml
+++ /dev/null
@@ -1,70 +0,0 @@
-
-
-
- hiveMetastoreUris
- Hive server metastore URIs
-
-
- hiveJdbcUrl
- Hive server jdbc url
-
-
- impalaJdbcUrl
- Impala server jdbc url
-
-
-
-
- ${jobTracker}
- ${nameNode}
-
-
- hive.metastore.uris
- ${hiveMetastoreUris}
-
-
- mapreduce.job.queuename
- ${queueName}
-
-
- oozie.launcher.mapred.job.queue.name
- ${oozieLauncherQueueName}
-
-
-
-
-
-
-
- Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
-
-
-
- eu.dnetlib.oa.graph.datasetsusagestats.export.ExecuteWorkflow
- --dataciteBaseURL
- ${dataciteBaseURL}
- --dataciteReportPath
- ${dataciteReportPath}
- --dbHiveUrl
- ${hiveJdbcUrl}
- --dbImpalaUrl
- ${impalaJdbcUrl}
- --datasetUsageStatsDBSchema
- ${datasetUsageStatsDBSchema}
- --statsDBSchema
- ${statsDBSchema}
- --recreateDbAndTables
- ${recreateDbAndTables}
- --datasetsEmptyDirs
- ${datasetsEmptyDirs}
- --finalTablesVisibleToImpala
- ${finalTablesVisibleToImpala}
-
-
-
-
-
-
-
-
diff --git a/dhp-workflows/dhp-usage-stats-update/pom.xml b/dhp-workflows/dhp-usage-stats-update/pom.xml
deleted file mode 100644
index b56257ee5..000000000
--- a/dhp-workflows/dhp-usage-stats-update/pom.xml
+++ /dev/null
@@ -1,78 +0,0 @@
-
-
-
-
-
-
-
-
- dhp-workflows
- eu.dnetlib.dhp
- 1.1.7-SNAPSHOT
-
- 4.0.0
- dhp-usage-stats-update
-
-
- UTF-8
- UTF-8
- 0.13.1-cdh5.2.1
- 2.5.0-cdh5.2.1
-
-
-
-
- org.apache.spark
- spark-core_2.11
- 2.2.0
-
-
- org.apache.spark
- spark-sql_2.11
- 2.4.5
-
-
- com.googlecode.json-simple
- json-simple
- 1.1.1
-
-
- org.json
- json
- 20180130
- jar
-
-
- org.apache.hive
- hive-jdbc
- ${cdh.hive.version}
-
-
- org.apache.hadoop
- hadoop-common
- ${cdh.hadoop.version}
-
-
- eu.dnetlib.dhp
- dhp-common
- ${project.version}
-
-
- c3p0
- c3p0
- 0.9.1.2
- jar
-
-
-
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java
deleted file mode 100644
index 29dd5648b..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.sql.Connection;
-import java.sql.DriverManager;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.Properties;
-
-import org.apache.log4j.Logger;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-import com.mchange.v2.c3p0.ComboPooledDataSource;
-
-public abstract class ConnectDB {
-
- public static Connection DB_HIVE_CONNECTION;
- public static Connection DB_IMPALA_CONNECTION;
-
- private static String dbHiveUrl;
- private static String dbImpalaUrl;
- private static String usageStatsDBSchema;
- private static String statsDBSchema;
- private final static Logger log = Logger.getLogger(ConnectDB.class);
-
- static void init() throws ClassNotFoundException {
-
- dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
- dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
- usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema;
- statsDBSchema = ExecuteWorkflow.statsDBSchema;
-
- Class.forName("org.apache.hive.jdbc.HiveDriver");
- }
-
- public static Connection getHiveConnection() throws SQLException {
- if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) {
- return DB_HIVE_CONNECTION;
- } else {
- DB_HIVE_CONNECTION = connectHive();
-
- return DB_HIVE_CONNECTION;
- }
- }
-
- public static Connection getImpalaConnection() throws SQLException {
- if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) {
- return DB_IMPALA_CONNECTION;
- } else {
- DB_IMPALA_CONNECTION = connectImpala();
-
- return DB_IMPALA_CONNECTION;
- }
- }
-
- public static String getUsageStatsDBSchema() {
- return ConnectDB.usageStatsDBSchema;
- }
-
- public static String getStatsDBSchema() {
- return ConnectDB.statsDBSchema;
- }
-
- private static Connection connectHive() throws SQLException {
- /*
- * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt =
- * connection.createStatement(); log.debug("Opened database successfully"); return connection;
- */
- ComboPooledDataSource cpds = new ComboPooledDataSource();
- cpds.setJdbcUrl(dbHiveUrl);
- cpds.setAcquireIncrement(1);
- cpds.setMaxPoolSize(100);
- cpds.setMinPoolSize(1);
- cpds.setInitialPoolSize(1);
- cpds.setMaxIdleTime(300);
- cpds.setMaxConnectionAge(36000);
-
- cpds.setAcquireRetryAttempts(5);
- cpds.setAcquireRetryDelay(2000);
- cpds.setBreakAfterAcquireFailure(false);
-
- cpds.setCheckoutTimeout(0);
- cpds.setPreferredTestQuery("SELECT 1");
- cpds.setIdleConnectionTestPeriod(60);
- return cpds.getConnection();
-
- }
-
- private static Connection connectImpala() throws SQLException {
- /*
- * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt =
- * connection.createStatement(); log.debug("Opened database successfully"); return connection;
- */
- ComboPooledDataSource cpds = new ComboPooledDataSource();
- cpds.setJdbcUrl(dbImpalaUrl);
- cpds.setAcquireIncrement(1);
- cpds.setMaxPoolSize(100);
- cpds.setMinPoolSize(1);
- cpds.setInitialPoolSize(1);
- cpds.setMaxIdleTime(300);
- cpds.setMaxConnectionAge(36000);
-
- cpds.setAcquireRetryAttempts(5);
- cpds.setAcquireRetryDelay(2000);
- cpds.setBreakAfterAcquireFailure(false);
-
- cpds.setCheckoutTimeout(0);
- cpds.setPreferredTestQuery("SELECT 1");
- cpds.setIdleConnectionTestPeriod(60);
-
- return cpds.getConnection();
-
- }
-
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java
deleted file mode 100644
index 50b951cbc..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.text.SimpleDateFormat;
-import java.util.Calendar;
-import java.util.Date;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.log4j.BasicConfigurator;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class ExecuteWorkflow {
-
- static String matomoAuthToken;
- static String matomoBaseURL;
- static String repoLogPath;
- static String portalLogPath;
- static String portalMatomoID;
- static String irusUKBaseURL;
- static String irusUKReportPath;
- static String sarcsReportPathArray;
- static String sarcsReportPathNonArray;
- static String lareferenciaLogPath;
- static String lareferenciaBaseURL;
- static String lareferenciaAuthToken;
- static String dbHiveUrl;
- static String dbImpalaUrl;
- static String usageStatsDBSchema;
- static String statsDBSchema;
- static boolean recreateDbAndTables;
-
- static boolean piwikEmptyDirs;
- static boolean downloadPiwikLogs;
- static boolean processPiwikLogs;
-
- static Calendar startingLogPeriod;
- static Calendar endingLogPeriod;
- static int numberOfPiwikIdsToDownload;
- static int numberOfSiteIdsToDownload;
-
- static boolean laReferenciaEmptyDirs;
- static boolean downloadLaReferenciaLogs;
- static boolean processLaReferenciaLogs;
-
- static boolean irusCreateTablesEmptyDirs;
- static boolean irusDownloadReports;
- static boolean irusProcessStats;
- static int irusNumberOfOpendoarsToDownload;
-
- static boolean sarcCreateTablesEmptyDirs;
- static boolean sarcDownloadReports;
- static boolean sarcProcessStats;
- static int sarcNumberOfIssnToDownload;
-
- static boolean finalizeStats;
- static boolean finalTablesVisibleToImpala;
-
- static int numberOfDownloadThreads;
-
- public static void main(String args[]) throws Exception {
-
- // Sending the logs to the console
- BasicConfigurator.configure();
-
- final ArgumentApplicationParser parser = new ArgumentApplicationParser(
- IOUtils
- .toString(
- UsageStatsExporter.class
- .getResourceAsStream(
- "/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json")));
- parser.parseArgument(args);
-
- // Setting up the initial parameters
- matomoAuthToken = parser.get("matomoAuthToken");
- matomoBaseURL = parser.get("matomoBaseURL");
- repoLogPath = parser.get("repoLogPath");
- portalLogPath = parser.get("portalLogPath");
- portalMatomoID = parser.get("portalMatomoID");
- irusUKBaseURL = parser.get("irusUKBaseURL");
- irusUKReportPath = parser.get("irusUKReportPath");
- sarcsReportPathArray = parser.get("sarcsReportPathArray");
- sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
- lareferenciaLogPath = parser.get("lareferenciaLogPath");
- lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
- lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
-
- dbHiveUrl = parser.get("dbHiveUrl");
- dbImpalaUrl = parser.get("dbImpalaUrl");
- usageStatsDBSchema = parser.get("usageStatsDBSchema");
- statsDBSchema = parser.get("statsDBSchema");
-
- if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
- recreateDbAndTables = true;
- else
- recreateDbAndTables = false;
-
- if (parser.get("piwikEmptyDirs").toLowerCase().equals("true"))
- piwikEmptyDirs = true;
- else
- piwikEmptyDirs = false;
-
- if (parser.get("downloadPiwikLogs").toLowerCase().equals("true"))
- downloadPiwikLogs = true;
- else
- downloadPiwikLogs = false;
-
- if (parser.get("processPiwikLogs").toLowerCase().equals("true"))
- processPiwikLogs = true;
- else
- processPiwikLogs = false;
-
- String startingLogPeriodStr = parser.get("startingLogPeriod");
- Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
- startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
-
- String endingLogPeriodStr = parser.get("endingLogPeriod");
- Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
- endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
-
- numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
- numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
-
- if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true"))
- laReferenciaEmptyDirs = true;
- else
- laReferenciaEmptyDirs = false;
-
- if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true"))
- downloadLaReferenciaLogs = true;
- else
- downloadLaReferenciaLogs = false;
-
- if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true"))
- processLaReferenciaLogs = true;
- else
- processLaReferenciaLogs = false;
-
- if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true"))
- irusCreateTablesEmptyDirs = true;
- else
- irusCreateTablesEmptyDirs = false;
- if (parser.get("irusDownloadReports").toLowerCase().equals("true"))
- irusDownloadReports = true;
- else
- irusDownloadReports = false;
- if (parser.get("irusProcessStats").toLowerCase().equals("true"))
- irusProcessStats = true;
- else
- irusProcessStats = false;
- irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload"));
-
- if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true"))
- sarcCreateTablesEmptyDirs = true;
- else
- sarcCreateTablesEmptyDirs = false;
- if (parser.get("sarcDownloadReports").toLowerCase().equals("true"))
- sarcDownloadReports = true;
- else
- sarcDownloadReports = false;
- if (parser.get("sarcProcessStats").toLowerCase().equals("true"))
- sarcProcessStats = true;
- else
- sarcProcessStats = false;
- sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload"));
-
- if (parser.get("finalizeStats").toLowerCase().equals("true"))
- finalizeStats = true;
- else
- finalizeStats = false;
- if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
- finalTablesVisibleToImpala = true;
- else
- finalTablesVisibleToImpala = false;
-
- numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
-
- UsageStatsExporter usagestatsExport = new UsageStatsExporter();
- usagestatsExport.export();
- }
-
- private static Calendar startingLogPeriodStr(Date date) {
-
- Calendar calendar = Calendar.getInstance();
- calendar.setTime(date);
- return calendar;
-
- }
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java
deleted file mode 100644
index 011c90532..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java
+++ /dev/null
@@ -1,419 +0,0 @@
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.io.*;
-import java.net.URL;
-import java.net.URLConnection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.Statement;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Calendar;
-import java.util.Date;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.json.simple.JSONArray;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class IrusStats {
-
- private String irusUKURL;
-
- private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
-
- public IrusStats(String irusUKURL) throws Exception {
- this.irusUKURL = irusUKURL;
- // The following may not be needed - It will be created when JSON tables are created
-// createTmpTables();
- }
-
- public void reCreateLogDirs() throws Exception {
- FileSystem dfs = FileSystem.get(new Configuration());
-
- logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
- dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true);
-
- logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath));
- }
-
- public void createTables() throws Exception {
- try {
- logger.info("Creating sushilog");
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog(source STRING, "
- + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
- + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTableSushiLog);
- logger.info("Created sushilog");
-
- // To see how to apply to the ignore duplicate rules and indexes
-// stmt.executeUpdate(sqlCreateTableSushiLog);
-// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO sushilog "
-// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
-// + "sushilog.rid, sushilog.date "
-// + "FROM sushilog "
-// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
-// stmt.executeUpdate(sqlcreateRuleSushiLog);
-// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
-// stmt.executeUpdate(createSushiIndex);
- stmt.close();
- ConnectDB.getHiveConnection().close();
- logger.info("Sushi Tables Created");
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
-
-// // The following may not be needed - It will be created when JSON tables are created
-// private void createTmpTables() throws Exception {
-// try {
-//
-// Statement stmt = ConnectDB.getConnection().createStatement();
-// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
-// stmt.executeUpdate(sqlCreateTableSushiLog);
-//
-// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;");
-// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;";
-// // stmt.executeUpdate(sqlCopyPublicSushiLog);
-// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO sushilogtmp "
-// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository,"
-// + "sushilogtmp.rid, sushilogtmp.date "
-// + "FROM sushilogtmp "
-// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
-// stmt.executeUpdate(sqlcreateRuleSushiLog);
-//
-// stmt.close();
-// ConnectDB.getConnection().close();
-// log.info("Sushi Tmp Tables Created");
-// } catch (Exception e) {
-// log.error("Failed to create tables: " + e);
-// throw new Exception("Failed to create tables: " + e.toString(), e);
-// }
-// }
- public void processIrusStats() throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Adding JSON Serde jar");
- stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
- logger.info("Added JSON Serde jar");
-
- logger.info("Dropping sushilogtmp_json table");
- String dropSushilogtmpJson = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".sushilogtmp_json";
- stmt.executeUpdate(dropSushilogtmpJson);
- logger.info("Dropped sushilogtmp_json table");
-
- logger.info("Creating irus_sushilogtmp_json table");
- String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n"
- + " `ItemIdentifier` ARRAY<\n"
- + " struct<\n"
- + " Type: STRING,\n"
- + " Value: STRING\n"
- + " >\n"
- + " >,\n"
- + " `ItemPerformance` ARRAY<\n"
- + " struct<\n"
- + " `Period`: struct<\n"
- + " `Begin`: STRING,\n"
- + " `End`: STRING\n"
- + " >,\n"
- + " `Instance`: struct<\n"
- + " `Count`: STRING,\n"
- + " `MetricType`: STRING\n"
- + " >\n"
- + " >\n"
- + " >\n"
- + ")\n"
- + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
- + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n"
- + "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(createSushilogtmpJson);
- logger.info("Created irus_sushilogtmp_json table");
-
- logger.info("Dropping irus_sushilogtmp table");
- String dropSushilogtmp = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".irus_sushilogtmp";
- stmt.executeUpdate(dropSushilogtmp);
- logger.info("Dropped irus_sushilogtmp table");
-
- logger.info("Creating irus_sushilogtmp table");
- String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
- + ".irus_sushilogtmp(source STRING, repository STRING, "
- + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
- + "tblproperties('transactional'='true')";
- stmt.executeUpdate(createSushilogtmp);
- logger.info("Created irus_sushilogtmp table");
-
- logger.info("Inserting to irus_sushilogtmp table");
- String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp "
- + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), "
- + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, "
- + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json "
- + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
- + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf "
- + "WHERE `ItemIdent`.`Type`= 'OAI'";
- stmt.executeUpdate(insertSushilogtmp);
- logger.info("Inserted to irus_sushilogtmp table");
-
- logger.info("Creating downloads_stats table");
- String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`result_id` string, "
- + "`date` string, "
- + "`count` bigint, "
- + "`openaire` bigint)";
- stmt.executeUpdate(createDownloadsStats);
- logger.info("Created downloads_stats table");
-
- logger.info("Inserting into downloads_stats");
- String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
- + "SELECT s.source, d.id AS repository_id, "
- + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, "
- + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
- + ConnectDB.getStatsDBSchema() + ".result_oids ro "
- + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
- stmt.executeUpdate(insertDStats);
- logger.info("Inserted into downloads_stats");
-
- logger.info("Creating sushilog table");
- String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`rid` string, "
- + "`date` string, "
- + "`metric_type` string, "
- + "`count` int)";
- stmt.executeUpdate(createSushilog);
- logger.info("Created sushilog table");
-
- logger.info("Inserting to sushilog table");
- String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM "
- + ConnectDB.getUsageStatsDBSchema()
- + ".irus_sushilogtmp";
- stmt.executeUpdate(insertToShushilog);
- logger.info("Inserted to sushilog table");
-
- ConnectDB.getHiveConnection().close();
- }
-
- public void getIrusRRReport(String irusUKReportPath) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM");
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime()));
-
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime()));
-
- String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate="
- + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime())
- + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback=";
-
- logger.info("(getIrusRRReport) Getting report: " + reportUrl);
-
- String text = getJson(reportUrl, "", "");
-
- List opendoarsToVisit = new ArrayList();
- JSONParser parser = new JSONParser();
- JSONObject jsonObject = (JSONObject) parser.parse(text);
- jsonObject = (JSONObject) jsonObject.get("ReportResponse");
- jsonObject = (JSONObject) jsonObject.get("Report");
- jsonObject = (JSONObject) jsonObject.get("Report");
- jsonObject = (JSONObject) jsonObject.get("Customer");
- JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
- int i = 0;
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRow = (JSONObject) aJsonArray;
- JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
- for (Object identifier : itemIdentifier) {
- JSONObject opendoar = (JSONObject) identifier;
- if (opendoar.get("Type").toString().equals("OpenDOAR")) {
- i++;
- opendoarsToVisit.add(opendoar.get("Value").toString());
- break;
- }
- }
- // break;
- }
-
- logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit);
-
- if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0
- && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) {
- logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
- opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
- }
-
- logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit);
-
- for (String opendoar : opendoarsToVisit) {
- logger.info("Now working on openDoar: " + opendoar);
- this.getIrusIRReport(opendoar, irusUKReportPath);
- }
-
- logger.info("(getIrusRRReport) Finished with report: " + reportUrl);
- }
-
- private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception {
-
- logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar);
-
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
-
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
-
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
-
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- PreparedStatement st = ConnectDB
- .getHiveConnection()
- .prepareStatement(
- "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
- st.setString(1, "opendoar____::" + opendoar);
- ResultSet rs_date = st.executeQuery();
- Date dateMax = null;
- while (rs_date.next()) {
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
- int batch_size = 0;
-
- if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) {
- logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar);
- } else {
- while (start.before(end)) {
- logger.info("date: " + simpleDateFormat.format(start.getTime()));
- String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
- + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime())
- + "&RepositoryIdentifier=opendoar%3A" + opendoar
- + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
- start.add(Calendar.MONTH, 1);
-
- logger.info("Downloading file: " + reportUrl);
- String text = getJson(reportUrl, "", "");
- if (text == null) {
- continue;
- }
-
- FileSystem fs = FileSystem.get(new Configuration());
- String filePath = irusUKReportPath + "/" + "IrusIRReport_"
- + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json";
- logger.info("Storing to file: " + filePath);
- FSDataOutputStream fin = fs.create(new Path(filePath), true);
-
- JSONParser parser = new JSONParser();
- JSONObject jsonObject = (JSONObject) parser.parse(text);
- jsonObject = (JSONObject) jsonObject.get("ReportResponse");
- jsonObject = (JSONObject) jsonObject.get("Report");
- jsonObject = (JSONObject) jsonObject.get("Report");
- jsonObject = (JSONObject) jsonObject.get("Customer");
- JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
- if (jsonArray == null) {
- continue;
- }
- String oai = "";
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRow = (JSONObject) aJsonArray;
- fin.write(jsonObjectRow.toJSONString().getBytes());
- fin.writeChar('\n');
- }
-
- fin.close();
- }
-
- }
- //ConnectDB.getHiveConnection().close();
-
- logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar);
- }
-
- private String getJson(String url) throws Exception {
- try {
- System.out.println("===> Connecting to: " + url);
- URL website = new URL(url);
- System.out.println("Connection url -----> " + url);
- URLConnection connection = website.openConnection();
-
- // connection.setRequestProperty ("Authorization", "Basic "+encoded);
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
-// response.append("\n");
- }
- }
-
- System.out.println("response ====> " + response.toString());
-
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL: " + e);
- System.out.println("Failed to get URL: " + e);
- throw new Exception("Failed to get URL: " + e.toString(), e);
- }
- }
-
- private String getJson(String url, String username, String password) throws Exception {
- // String cred=username+":"+password;
- // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
- try {
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
- // connection.setRequestProperty ("Authorization", "Basic "+encoded);
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
- response.append("\n");
- }
- }
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL", e);
- return null;
- }
- }
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java
deleted file mode 100644
index 7a61b1f46..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java
+++ /dev/null
@@ -1,265 +0,0 @@
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.io.*;
-import java.net.URL;
-import java.net.URLConnection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.Statement;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Calendar;
-import java.util.Date;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.json.simple.JSONArray;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class LaReferenciaDownloadLogs {
-
- private final String piwikUrl;
- private Date startDate;
- private final String tokenAuth;
-
- /*
- * The Piwik's API method
- */
- private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
- private final String format = "&format=json";
- private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess";
-
- private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class);
-
- public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception {
- this.piwikUrl = piwikUrl;
- this.tokenAuth = tokenAuth;
- this.createTables();
-// this.createTmpTables();
- }
-
- public void reCreateLogDirs() throws IllegalArgumentException, IOException {
- FileSystem dfs = FileSystem.get(new Configuration());
-
- logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
- dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
-
- logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
- }
-
- private void createTables() throws Exception {
- try {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Creating LaReferencia tables");
- String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
- + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
- + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
- + "stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTableLareferenciaLog);
- logger.info("Created LaReferencia tables");
-// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO lareferencialog "
-// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
-// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id "
-// + "FROM lareferencialog "
-// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
-// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");";
-// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
-// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
- logger.info("Lareferencia Tables Created");
-
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- // System.exit(0);
- }
- }
-
-// private void createTmpTables() throws Exception {
-//
-// try {
-// Statement stmt = ConnectDB.getConnection().createStatement();
-// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
-// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO lareferencialogtmp "
-// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit,"
-// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id "
-// + "FROM lareferencialogtmp "
-// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
-// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog);
-// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog);
-//
-// stmt.close();
-// log.info("Lareferencia Tmp Tables Created");
-//
-// } catch (Exception e) {
-// log.error("Failed to create tmptables: " + e);
-// throw new Exception("Failed to create tmp tables: " + e.toString(), e);
-// // System.exit(0);
-// }
-// }
- private String getPiwikLogUrl() {
- return piwikUrl + "/";
- }
-
- private String getJson(String url) throws Exception {
- try {
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
-
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
-// response.append("\n");
- }
- }
-
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL: " + e);
- throw new Exception("Failed to get URL: " + e.toString(), e);
- }
- }
-
- public void GetLaReferenciaRepos(String repoLogsPath) throws Exception {
-
- String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
- String content = "";
-
- List siteIdsToVisit = new ArrayList();
-
- // Getting all the siteIds in a list for logging reasons & limiting the list
- // to the max number of siteIds
- content = getJson(baseApiUrl);
- JSONParser parser = new JSONParser();
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRow = (JSONObject) aJsonArray;
- siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
- }
- logger.info("Found the following siteIds for download: " + siteIdsToVisit);
-
- if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
- && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) {
- logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
- siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
- }
-
- logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit);
-
- for (int siteId : siteIdsToVisit) {
- logger.info("Now working on LaReferencia MatomoId: " + siteId);
- this.GetLaReFerenciaLogs(repoLogsPath, siteId);
- }
- }
-
- public void GetLaReFerenciaLogs(String repoLogsPath,
- int laReferencialMatomoID) throws Exception {
-
- logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);
-
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("Starting period for log download: " + sdf.format(start.getTime()));
-
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("Ending period for log download: " + sdf.format(end.getTime()));
-
- PreparedStatement st = ConnectDB
- .getHiveConnection()
- .prepareStatement(
- "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
- + ".lareferencialog WHERE matomoid=?");
- st.setInt(1, laReferencialMatomoID);
- Date dateMax = null;
-
- ResultSet rs_date = st.executeQuery();
- while (rs_date.next()) {
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
-
- for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
- Date date = currDay.getTime();
- if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
- logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID);
- } else {
- logger
- .info(
- "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "
- + sdf.format(date));
-
- String period = "&period=day&date=" + sdf.format(date);
- String outFolder = "";
- outFolder = repoLogsPath;
-
- FileSystem fs = FileSystem.get(new Configuration());
- FSDataOutputStream fin = fs
- .create(
- new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"),
- true);
-
- String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format
- + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
- String content = "";
- int i = 0;
-
- JSONParser parser = new JSONParser();
- do {
- String apiUrl = baseApiUrl;
-
- if (i > 0) {
- apiUrl += "&filter_offset=" + (i * 1000);
- }
-
- content = getJson(apiUrl);
- if (content.length() == 0 || content.equals("[]")) {
- break;
- }
-
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
- fin.write(jsonObjectRaw.toJSONString().getBytes());
- fin.writeChar('\n');
- }
-
- logger
- .info(
- "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID
- + " and for "
- + sdf.format(date));
- i++;
- } while (true);
- fin.close();
- }
- }
- }
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java
deleted file mode 100644
index 747b5ce0e..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java
+++ /dev/null
@@ -1,436 +0,0 @@
-
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.io.*;
-import java.net.URLDecoder;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.sql.Timestamp;
-import java.text.SimpleDateFormat;
-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
-import org.json.simple.JSONArray;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class LaReferenciaStats {
-
- private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class);
-
- private String logRepoPath;
-
- private Statement stmt = null;
-
- private String CounterRobotsURL;
- private ArrayList robotsList;
-
- public LaReferenciaStats(String logRepoPath) throws Exception {
- this.logRepoPath = logRepoPath;
- this.createTables();
-// this.createTmpTables();
- }
-
- /*
- * private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) {
- * log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } }
- */
- private void createTables() throws Exception {
- try {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Creating LaReferencia tables");
- String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " +
- ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " +
- "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " +
- "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
- "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " +
- "stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTableLareferenciaLog);
- logger.info("Created LaReferencia tables");
-// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO lareferencialog "
-// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
-// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id "
-// + "FROM lareferencialog "
-// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
-// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");";
-// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
-// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
- logger.info("Lareferencia Tables Created");
-
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- // System.exit(0);
- }
- }
-
-// private void createTmpTables() throws Exception {
-//
-// try {
-// Statement stmt = ConnectDB.getConnection().createStatement();
-// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
-// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO lareferencialogtmp "
-// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit,"
-// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id "
-// + "FROM lareferencialogtmp "
-// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
-// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog);
-// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog);
-//
-// stmt.close();
-// log.info("Lareferencia Tmp Tables Created");
-//
-// } catch (Exception e) {
-// log.error("Failed to create tmptables: " + e);
-// throw new Exception("Failed to create tmp tables: " + e.toString(), e);
-// // System.exit(0);
-// }
-// }
-
- public void processLogs() throws Exception {
- try {
- logger.info("Processing LaReferencia repository logs");
- processlaReferenciaLog();
- logger.info("LaReferencia repository logs process done");
-
- logger.info("LaReferencia removing double clicks");
- removeDoubleClicks();
- logger.info("LaReferencia removed double clicks");
-
- logger.info("LaReferencia creating viewsStats");
- viewsStats();
- logger.info("LaReferencia created viewsStats");
- logger.info("LaReferencia creating downloadsStats");
- downloadsStats();
- logger.info("LaReferencia created downloadsStats");
- logger.info("LaReferencia updating Production Tables");
- updateProdTables();
- logger.info("LaReferencia updated Production Tables");
-
- } catch (Exception e) {
- logger.error("Failed to process logs: " + e);
- throw new Exception("Failed to process logs: " + e.toString(), e);
- }
- }
-
- public void processlaReferenciaLog() throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Adding JSON Serde jar");
- stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
- logger.info("Added JSON Serde jar");
-
- logger.info("Dropping lareferencialogtmp_json table");
- String drop_lareferencialogtmp_json = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".lareferencialogtmp_json";
- stmt.executeUpdate(drop_lareferencialogtmp_json);
- logger.info("Dropped lareferencialogtmp_json table");
-
- logger.info("Creating lareferencialogtmp_json");
- String create_lareferencialogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".lareferencialogtmp_json(\n" +
- " `idSite` STRING,\n" +
- " `idVisit` STRING,\n" +
- " `country` STRING,\n" +
- " `referrerName` STRING,\n" +
- " `browser` STRING,\n" +
- " `repItem` STRING,\n" +
- " `actionDetails` ARRAY<\n" +
- " struct<\n" +
- " timestamp: STRING,\n" +
- " type: STRING,\n" +
- " url: STRING,\n" +
- " `customVariables`: struct<\n" +
- " `1`: struct<\n" +
- " `customVariablePageValue1`: STRING\n" +
- " >,\n" +
- " `2`: struct<\n" +
- " `customVariablePageValue2`: STRING\n" +
- " >\n" +
- " >\n" +
- " >\n" +
- " >" +
- ")\n" +
- "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
- "LOCATION '" + ExecuteWorkflow.lareferenciaLogPath + "'\n" +
- "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(create_lareferencialogtmp_json);
- logger.info("Created lareferencialogtmp_json");
-
- logger.info("Dropping lareferencialogtmp table");
- String drop_lareferencialogtmp = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".lareferencialogtmp";
- stmt.executeUpdate(drop_lareferencialogtmp);
- logger.info("Dropped lareferencialogtmp table");
-
- logger.info("Creating lareferencialogtmp");
- String create_lareferencialogtmp = "CREATE TABLE " +
- ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp(matomoid INT, " +
- "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " +
- "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
- "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " +
- "stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(create_lareferencialogtmp);
- logger.info("Created lareferencialogtmp");
-
- logger.info("Inserting into lareferencialogtmp");
- String insert_lareferencialogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp " +
- "SELECT DISTINCT cast(idSite as INT) as matomoid, CONCAT('opendoar____::', " +
- "actiondetail.customVariables.`2`.customVariablePageValue2) as source, idVisit as id_Visit, country, " +
- "actiondetail.type as action, actiondetail.url as url, " +
- "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " +
- "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " +
- "referrerName as referrer_name, browser as agent " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json " +
- "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
- stmt.executeUpdate(insert_lareferencialogtmp);
- logger.info("Inserted into lareferencialogtmp");
-
- stmt.close();
- }
-
- public void removeDoubleClicks() throws Exception {
-
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Cleaning download double clicks");
- // clean download double clicks
- String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" +
- "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " +
- ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " +
- "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " +
- "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " +
- "AND p1.timestamp listHdfsDir(String dir) throws Exception {
- FileSystem hdfs = FileSystem.get(new Configuration());
- RemoteIterator Files;
- ArrayList fileNames = new ArrayList<>();
-
- try {
- Path exportPath = new Path(hdfs.getUri() + dir);
- Files = hdfs.listFiles(exportPath, false);
- while (Files.hasNext()) {
- String fileName = Files.next().getPath().toString();
- // log.info("Found hdfs file " + fileName);
- fileNames.add(fileName);
- }
- // hdfs.close();
- } catch (Exception e) {
- logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logRepoPath));
- throw new Exception("HDFS file path with exported data does not exist : " + logRepoPath, e);
- }
-
- return fileNames;
- }
-
- private String readHDFSFile(String filename) throws Exception {
- String result;
- try {
-
- FileSystem fs = FileSystem.get(new Configuration());
- // log.info("reading file : " + filename);
-
- BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));
-
- StringBuilder sb = new StringBuilder();
- String line = br.readLine();
-
- while (line != null) {
- if (!line.equals("[]")) {
- sb.append(line);
- }
- // sb.append(line);
- line = br.readLine();
- }
- result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
- if (result.equals("")) {
- result = "[]";
- }
-
- // fs.close();
- } catch (Exception e) {
- logger.error(e.getMessage());
- throw new Exception(e);
- }
-
- return result;
- }
-
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java
deleted file mode 100644
index 8f7fffa9f..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java
+++ /dev/null
@@ -1,325 +0,0 @@
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.io.*;
-import java.net.Authenticator;
-import java.net.URL;
-import java.net.URLConnection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.Statement;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Calendar;
-import java.util.Date;
-import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.json.simple.JSONArray;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class PiwikDownloadLogs {
-
- private final String piwikUrl;
- private Date startDate;
- private final String tokenAuth;
-
- /*
- * The Piwik's API method
- */
- private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
- private final String format = "&format=json";
-
- private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
-
- public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
- this.piwikUrl = piwikUrl;
- this.tokenAuth = tokenAuth;
-
- }
-
- private String getPiwikLogUrl() {
- return "https://" + piwikUrl + "/";
- }
-
- private String getJson(String url) throws Exception {
- try {
- logger.debug("Connecting to download the JSON: " + url);
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
-
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
- }
- }
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL: " + url + " Exception: " + e);
- throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
- }
- }
-
- class WorkerThread implements Runnable {
-
- private Calendar currDay;
- private int siteId;
- private String repoLogsPath;
- private String portalLogPath;
- private String portalMatomoID;
-
- public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws IOException {
- this.currDay = (Calendar) currDay.clone();
- this.siteId = new Integer(siteId);
- this.repoLogsPath = new String(repoLogsPath);
- this.portalLogPath = new String(portalLogPath);
- this.portalMatomoID = new String(portalMatomoID);
- }
-
- public void run() {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- System.out
- .println(
- Thread.currentThread().getName() + " (Start) Thread for "
- + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
- + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
- + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
- try {
- GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
-
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- System.out
- .println(
- Thread.currentThread().getName() + " (End) Thread for "
- + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
- + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
- + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
- }
-
- public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
-
- Date date = currDay.getTime();
- logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
-
- String period = "&period=day&date=" + sdf.format(date);
- String outFolder = "";
- if (siteId == Integer.parseInt(portalMatomoID)) {
- outFolder = portalLogPath;
- } else {
- outFolder = repoLogsPath;
- }
-
- String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
- + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
- String content = "";
-
- int i = 0;
-
- JSONParser parser = new JSONParser();
- StringBuffer totalContent = new StringBuffer();
- FileSystem fs = FileSystem.get(new Configuration());
-
- do {
- int writtenBytes = 0;
- String apiUrl = baseApiUrl;
-
- if (i > 0) {
- apiUrl += "&filter_offset=" + (i * 1000);
- }
-
- content = getJson(apiUrl);
- if (content.length() == 0 || content.equals("[]")) {
- break;
- }
-
- FSDataOutputStream fin = fs
- .create(
- new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json"),
- true);
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
- byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
- fin.write(jsonObjectRawBytes);
- fin.writeChar('\n');
-
- writtenBytes += jsonObjectRawBytes.length + 1;
- }
-
- fin.close();
- System.out
- .println(
- Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
- + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json");
-
- i++;
- } while (true);
-
- fs.close();
- }
- }
-
- public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
-
- Statement statement = ConnectDB.getHiveConnection().createStatement();
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
-
- ResultSet rs = statement
- .executeQuery(
- "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
- + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
-
- // Getting all the piwikids in a list for logging reasons & limitting the list
- // to the max number of piwikids
- List piwikIdToVisit = new ArrayList();
- while (rs.next())
- piwikIdToVisit.add(rs.getInt(1));
- logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
-
- if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
- && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
- logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
- piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
- }
-
- logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
-
- // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
- for (int siteId : piwikIdToVisit) {
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("Starting period for log download: " + sdf.format(start.getTime()));
-
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("Ending period for log download: " + sdf.format(end.getTime()));
-
- logger.info("Now working on piwikId: " + siteId);
-
- PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
- .prepareStatement(
- "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
- + ".piwiklog WHERE source=?");
- st.setInt(1, siteId);
- Date dateMax = null;
- ResultSet rs_date = st.executeQuery();
- while (rs_date.next()) {
- logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
-
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
-
- for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
- // logger.info("Date used " + currDay.toString());
- // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
- // executor.execute(worker);// calling execute method of ExecutorService
- logger.info("Date used " + currDay.getTime().toString());
-
- if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
- logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId);
- } else {
- GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
- }
-
- }
- }
- // executor.shutdown();
- // while (!executor.isTerminated()) {
- // }
- // System.out.println("Finished all threads");
- }
-
- public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
-
- Date date = currDay.getTime();
- logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
-
- String period = "&period=day&date=" + sdf.format(date);
- String outFolder = "";
- if (siteId == Integer.parseInt(portalMatomoID)) {
- outFolder = portalLogPath;
- } else {
- outFolder = repoLogsPath;
- }
-
- String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
- + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
- String content = "";
-
- int i = 0;
-
- JSONParser parser = new JSONParser();
- StringBuffer totalContent = new StringBuffer();
- FileSystem fs = FileSystem.get(new Configuration());
-
- do {
- int writtenBytes = 0;
- String apiUrl = baseApiUrl;
-
- if (i > 0) {
- apiUrl += "&filter_offset=" + (i * 1000);
- }
-
- content = getJson(apiUrl);
- if (content.length() == 0 || content.equals("[]")) {
- break;
- }
-
- FSDataOutputStream fin = fs
- .create(
- new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json"),
- true);
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
- byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
- fin.write(jsonObjectRawBytes);
- fin.writeChar('\n');
-
- writtenBytes += jsonObjectRawBytes.length + 1;
- }
-
- fin.close();
- System.out
- .println(
- Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
- + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json");
-
- i++;
- } while (true);
-
- fs.close();
- }
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
deleted file mode 100644
index 6d5bdfac0..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
+++ /dev/null
@@ -1,1262 +0,0 @@
-
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.io.*;
-import java.net.URLDecoder;
-import java.sql.Connection;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.text.SimpleDateFormat;
-import java.util.*;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class PiwikStatsDB {
-
- private String logPath;
- private String logRepoPath;
- private String logPortalPath;
-
- private Statement stmt = null;
-
- private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
-
- private String CounterRobotsURL;
- private ArrayList robotsList;
-
- public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception {
- this.logRepoPath = logRepoPath;
- this.logPortalPath = logPortalPath;
-
- }
-
- public void reCreateLogDirs() throws IllegalArgumentException, IOException {
- FileSystem dfs = FileSystem.get(new Configuration());
-
- logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
- dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
-
- logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
- dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
-
- logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
-
- logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
- }
-
- public void recreateDBAndTables() throws Exception {
- this.createDatabase();
- this.createTables();
- // The piwiklog table is not needed since it is built
- // on top of JSON files
- this.createTmpTables();
- }
-
- public ArrayList getRobotsList() {
- return robotsList;
- }
-
- public void setRobotsList(ArrayList robotsList) {
- this.robotsList = robotsList;
- }
-
- public String getCounterRobotsURL() {
- return CounterRobotsURL;
- }
-
- public void setCounterRobotsURL(String CounterRobotsURL) {
- this.CounterRobotsURL = CounterRobotsURL;
- }
-
- private void createDatabase() throws Exception {
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
- String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
- stmt.executeUpdate(dropDatabase);
- } catch (Exception e) {
- logger.error("Failed to drop database: " + e);
- throw new Exception("Failed to drop database: " + e.toString(), e);
- }
-
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
- String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema();
- stmt.executeUpdate(createDatabase);
-
- } catch (Exception e) {
- logger.error("Failed to create database: " + e);
- throw new Exception("Failed to create database: " + e.toString(), e);
- }
- }
-
- private void createTables() throws Exception {
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- // Create Piwiklog table - This table should exist
- String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
- + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, action, timestamp, entity_id) "
- + "into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTablePiwikLog);
-
- /////////////////////////////////////////
- // Rule for duplicate inserts @ piwiklog
- /////////////////////////////////////////
-
- String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
- + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTablePortalLog);
-
- //////////////////////////////////////////////////
- // Rule for duplicate inserts @ process_portal_log
- //////////////////////////////////////////////////
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
-
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
-
- private void createTmpTables() throws Exception {
- try {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
- + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
- + "stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTmpTablePiwikLog);
-
- //////////////////////////////////////////////////
- // Rule for duplicate inserts @ piwiklogtmp
- //////////////////////////////////////////////////
-
- //////////////////////////////////////////////////
- // Copy from public.piwiklog to piwiklog
- //////////////////////////////////////////////////
- // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;";
- // stmt.executeUpdate(sqlCopyPublicPiwiklog);
-
- String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
- + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTmpTablePortalLog);
-
- //////////////////////////////////////////////////
- // Rule for duplicate inserts @ process_portal_log_tmp
- //////////////////////////////////////////////////
-
- stmt.close();
-
- } catch (Exception e) {
- logger.error("Failed to create tmptables: " + e);
- throw new Exception("Failed to create tmp tables: " + e.toString(), e);
- // System.exit(0);
- }
- }
-
- public void processLogs() throws Exception {
- try {
- ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL());
- this.robotsList = counterRobots.getRobotsPatterns();
-
- logger.info("Processing repository logs");
- processRepositoryLog();
- logger.info("Repository logs process done");
-
- logger.info("Removing double clicks");
- removeDoubleClicks();
- logger.info("Removing double clicks done");
-
- logger.info("Cleaning oai");
- cleanOAI();
- logger.info("Cleaning oai done");
-
- logger.info("Processing portal logs");
- processPortalLog();
- logger.info("Portal logs process done");
-
- logger.info("Processing portal usagestats");
- portalStats();
- logger.info("Portal usagestats process done");
-
- logger.info("ViewsStats processing starts");
- viewsStats();
- logger.info("ViewsStats processing ends");
-
- logger.info("DownloadsStats processing starts");
- downloadsStats();
- logger.info("DownloadsStats processing starts");
-
- logger.info("Updating Production Tables");
- updateProdTables();
- logger.info("Updated Production Tables");
-
- } catch (Exception e) {
- logger.error("Failed to process logs: " + e);
- throw new Exception("Failed to process logs: " + e.toString(), e);
- }
- }
-
- public void processRepositoryLog() throws Exception {
-
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Adding JSON Serde jar");
- stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
- logger.info("Added JSON Serde jar");
-
- logger.info("Dropping piwiklogtmp_json table");
- String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".piwiklogtmp_json";
- stmt.executeUpdate(drop_piwiklogtmp_json);
- logger.info("Dropped piwiklogtmp_json table");
-
- logger.info("Creating piwiklogtmp_json");
- String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".piwiklogtmp_json(\n" +
- " `idSite` STRING,\n" +
- " `idVisit` STRING,\n" +
- " `country` STRING,\n" +
- " `referrerName` STRING,\n" +
- " `browser` STRING,\n" +
- " `actionDetails` ARRAY<\n" +
- " struct<\n" +
- " type: STRING,\n" +
- " url: STRING,\n" +
- " `customVariables`: struct<\n" +
- " `1`: struct<\n" +
- " `customVariablePageValue1`: STRING\n" +
- " >\n" +
- " >,\n" +
- " timestamp: String\n" +
- " >\n" +
- " >\n" +
- ")\n" +
- "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
- "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" +
- "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(create_piwiklogtmp_json);
- logger.info("Created piwiklogtmp_json");
-
- logger.info("Dropping piwiklogtmp table");
- String drop_piwiklogtmp = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".piwiklogtmp";
- stmt.executeUpdate(drop_piwiklogtmp);
- logger.info("Dropped piwiklogtmp");
-
- logger.info("Creating piwiklogtmp");
- String create_piwiklogtmp = "CREATE TABLE " +
- ConnectDB.getUsageStatsDBSchema() +
- ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " +
- "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
- "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(create_piwiklogtmp);
- logger.info("Created piwiklogtmp");
-
- logger.info("Inserting into piwiklogtmp");
- String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " +
- "actiondetail.type as action, actiondetail.url as url, " +
- "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " +
- "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " +
- "referrerName as referrer_name, browser as agent\n" +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" +
- "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
- stmt.executeUpdate(insert_piwiklogtmp);
- logger.info("Inserted into piwiklogtmp");
-
- stmt.close();
- }
-
- public void removeDoubleClicks() throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Cleaning download double clicks");
- // clean download double clicks
- String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "WHERE EXISTS (\n" +
- "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " +
- ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" +
- "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
- +
- "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" +
- "AND p1.timestamp\n" +
- " >\n" +
- ")\n" +
- "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
- "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" +
- "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(create_process_portal_log_tmp_json);
- logger.info("Created process_portal_log_tmp_json");
-
- logger.info("Droping process_portal_log_tmp table");
- String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".process_portal_log_tmp";
- stmt.executeUpdate(drop_process_portal_log_tmp);
- logger.info("Dropped process_portal_log_tmp");
-
- logger.info("Creating process_portal_log_tmp");
- String create_process_portal_log_tmp = "CREATE TABLE " +
- ConnectDB.getUsageStatsDBSchema() +
- ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " +
- "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
- "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(create_process_portal_log_tmp);
- logger.info("Created process_portal_log_tmp");
-
- logger.info("Inserting into process_portal_log_tmp");
- String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
- + ".process_portal_log_tmp " +
- "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
- +
- "actiondetail.url as url, " +
- "CASE\n" +
- " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " +
- " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " +
- " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
- +
- " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " +
- " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " +
- " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " +
- " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " +
- " ELSE '' " +
- "END AS entity_id, " +
- "CASE " +
- " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " +
- " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " +
- " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " +
- " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " +
- " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " +
- " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " +
- " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " +
- " ELSE '' " +
- "END AS source_item_type, " +
- "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " +
- "browser as agent " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " +
- "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
- stmt.executeUpdate(insert_process_portal_log_tmp);
- logger.info("Inserted into process_portal_log_tmp");
-
- stmt.close();
- }
-
- public void portalStats() throws SQLException {
- Connection con = ConnectDB.getHiveConnection();
- Statement stmt = con.createStatement();
- con.setAutoCommit(false);
-
-// Original queries where of the style
-//
-// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent
-// FROM usagestats_20200907.process_portal_log_tmp2,
-// openaire_prod_stats_20200821.result_oids roid
-// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null
-//
-// The following query is an example of how queries should be
-//
-//
-// INSERT INTO usagestats_20200907.piwiklogtmp
-// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
-// FROM usagestats_20200907.process_portal_log_tmp
-// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id
-// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL);
-//
-// We should consider if we would like the queries to be as the following
-//
-// INSERT INTO usagestats_20200907.piwiklogtmp
-// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
-// FROM usagestats_20200907.process_portal_log_tmp
-// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id
-// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND
-// roid.oid != '');
-
- logger.info("PortalStats - Step 1");
- String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent "
- +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
- "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
- "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
- + ".result_oids roid WHERE roid.id IS NOT NULL)";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("PortalStats - Step 2");
- stmt = con.createStatement();
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent "
- +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
- "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
- "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
- + ".datasource_oids roid WHERE roid.id IS NOT NULL)";
- stmt.executeUpdate(sql);
- stmt.close();
-
- /*
- * logger.info("PortalStats - Step 3"); stmt = con.createStatement(); sql = "INSERT INTO " +
- * ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- * "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'organization', `timestamp`, referrer_name, agent "
- * + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
- * "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
- * "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() +
- * ".organization_oids roid WHERE roid.id IS NOT NULL)"; // stmt.executeUpdate(sql); stmt.close();
- */
- logger.info("PortalStats - Step 3");
- stmt = con.createStatement();
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent "
- +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
- "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
- "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
- + ".project_oids roid WHERE roid.id IS NOT NULL)";
- stmt.executeUpdate(sql);
- stmt.close();
-
- con.close();
- }
-
- private void cleanOAI() throws Exception {
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Cleaning oai - Step 1");
- stmt = ConnectDB.getHiveConnection().createStatement();
- String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," +
- "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 2");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," +
- "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 3");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," +
- "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 4");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," +
- "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 5");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," +
- "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 6");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," +
- "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 7");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," +
- "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 8");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," +
- "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 9");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," +
- "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 10");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," +
- "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 11");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," +
- "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 12");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," +
- "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 13");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," +
- "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 14");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," +
- "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 15");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," +
- "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 16");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," +
- "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 17");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," +
- "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 18");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," +
- "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 19");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," +
- "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 20");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," +
- "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 21");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," +
- "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 22");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," +
- "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 23");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," +
- "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 24");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," +
- "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 25");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," +
- "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 26");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," +
- "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 27");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," +
- "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 28");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," +
- "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Step 29");
- stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," +
- "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'";
- stmt.executeUpdate(sql);
- stmt.close();
-
- logger.info("Cleaning oai - Done, closing connection");
- ConnectDB.getHiveConnection().close();
- }
-
- private String processPortalURL(String url) {
-
- if (url.indexOf("explore.openaire.eu") > 0) {
- try {
- url = URLDecoder.decode(url, "UTF-8");
- } catch (Exception e) {
- logger.info("Error when decoding the following URL: " + url);
- }
- if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) {
- url = "datasource|"
- + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59);
- } else if (url.indexOf("datasource=") > 0
- && url.substring(url.indexOf("datasource=") + 11).length() >= 46) {
- url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57);
- } else if (url.indexOf("datasourceFilter=") > 0
- && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) {
- url = "datasource|"
- + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63);
- } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) {
- url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56);
- } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) {
- url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56);
- } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46
- && !url.contains("oai:dnet:corda")) {
- url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56);
- } else if (url.indexOf("organizationId=") > 0
- && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) {
- url = "organization|"
- + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61);
- } else {
- url = "";
- }
- } else {
- url = "";
- }
-
- return url;
- }
-
- private void updateProdTables() throws SQLException {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Inserting data to piwiklog");
- String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
- stmt.executeUpdate(sql);
-
- logger.info("Inserting data to views_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp";
- stmt.executeUpdate(sql);
-
- logger.info("Inserting data to downloads_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp";
- stmt.executeUpdate(sql);
-
- logger.info("Inserting data to pageviews_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp";
- stmt.executeUpdate(sql);
-
- logger.info("Creating usage_stats table");
- String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
- "AS SELECT coalesce(ds.source, vs.source) as source, " +
- "coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
- "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
- "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
- "coalesce(ds.openaire, 0) as openaire_downloads, " +
- "coalesce(vs.openaire, 0) as openaire_views " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " +
- ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " +
- "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
- stmt.executeUpdate(createUsageStats);
- logger.info("Created usage_stats table");
-
-
- /*
- * logger.info("Dropping table views_stats_tmp"); sql = "DROP TABLE IF EXISTS " +
- * ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(sql);
- * logger.info("Dropping table downloads_stats_tmp"); sql = "DROP TABLE IF EXISTS " +
- * ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; stmt.executeUpdate(sql);
- * logger.info("Dropping table pageviews_stats_tmp"); sql = "DROP TABLE IF EXISTS " +
- * ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(sql);
- * logger.info("Dropping table process_portal_log_tmp"); sql = "DROP TABLE IF EXISTS " +
- * ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; stmt.executeUpdate(sql);
- */
- stmt.close();
- ConnectDB.getHiveConnection().close();
-
- }
-
- private ArrayList listHdfsDir(String dir) throws Exception {
-
- FileSystem hdfs = FileSystem.get(new Configuration());
- RemoteIterator Files;
- ArrayList fileNames = new ArrayList<>();
-
- try {
- Path exportPath = new Path(hdfs.getUri() + dir);
- Files = hdfs.listFiles(exportPath, false);
- while (Files.hasNext()) {
- String fileName = Files.next().getPath().toString();
- fileNames.add(fileName);
- }
-
- hdfs.close();
- } catch (Exception e) {
- logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath));
- throw new Exception("HDFS file path with exported data does not exist : " + logPath, e);
- }
-
- return fileNames;
- }
-
- private String readHDFSFile(String filename) throws Exception {
- String result;
- try {
-
- FileSystem fs = FileSystem.get(new Configuration());
- // log.info("reading file : " + filename);
-
- BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));
-
- StringBuilder sb = new StringBuilder();
- String line = br.readLine();
-
- while (line != null) {
- if (!line.equals("[]")) {
- sb.append(line);
- }
- // sb.append(line);
- line = br.readLine();
- }
- result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
- if (result.equals("")) {
- result = "[]";
- }
-
- // fs.close();
- } catch (Exception e) {
- logger.error(e.getMessage());
- throw new Exception(e);
- }
-
- return result;
- }
-
- private Connection getConnection() throws SQLException {
- return ConnectDB.getHiveConnection();
- }
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java
deleted file mode 100644
index 1708a1c64..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-
-package eu.dnetlib.oa.graph.usagestats.export;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-
-import org.json.JSONException;
-import org.json.simple.JSONArray;
-import org.json.simple.parser.JSONParser;
-import org.json.simple.parser.ParseException;
-
-public class ReadCounterRobotsList {
-
- private ArrayList robotsPatterns = new ArrayList();
- private String COUNTER_ROBOTS_URL;
-
- public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException {
- COUNTER_ROBOTS_URL = url;
- robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL);
- }
-
- private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException {
- InputStream is = new URL(url).openStream();
- JSONParser parser = new JSONParser();
- BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1")));
- JSONArray jsonArray = (JSONArray) parser.parse(reader);
- for (Object aJsonArray : jsonArray) {
- org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray;
- robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\"));
- }
- return robotsPatterns;
- }
-
- public ArrayList getRobotsPatterns() {
- return robotsPatterns;
- }
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java
deleted file mode 100644
index 06e350c9e..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java
+++ /dev/null
@@ -1,575 +0,0 @@
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.io.*;
-// import java.io.BufferedReader;
-// import java.io.InputStreamReader;
-import java.net.URL;
-import java.net.URLConnection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Calendar;
-import java.util.Date;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.json.simple.JSONArray;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.json.simple.parser.ParseException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author D. Pierrakos, S. Zoupanos
- */
-public class SarcStats {
-
- private Statement stmtHive = null;
- private Statement stmtImpala = null;
-
- private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
-
- public SarcStats() throws Exception {
-// createTables();
- }
-
- private void createTables() throws Exception {
- try {
-
- stmtHive = ConnectDB.getHiveConnection().createStatement();
- String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
- stmtHive.executeUpdate(sqlCreateTableSushiLog);
-
- // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
- // stmt.executeUpdate(sqlCopyPublicSushiLog);
- String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
- + " ON INSERT TO sushilog "
- + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
- + "sushilog.rid, sushilog.date "
- + "FROM sushilog "
- + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
- stmtHive.executeUpdate(sqlcreateRuleSushiLog);
- String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
- stmtHive.executeUpdate(createSushiIndex);
-
- stmtHive.close();
- ConnectDB.getHiveConnection().close();
- logger.info("Sushi Tables Created");
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
-
- public void reCreateLogDirs() throws IOException {
- FileSystem dfs = FileSystem.get(new Configuration());
-
- logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
- dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true);
-
- logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
- dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true);
-
- logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
- dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray));
-
- logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
- dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray));
- }
-
- public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Adding JSON Serde jar");
- stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
- logger.info("Added JSON Serde jar");
-
- logger.info("Dropping sarc_sushilogtmp_json_array table");
- String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
- stmt.executeUpdate(drop_sarc_sushilogtmp_json_array);
- logger.info("Dropped sarc_sushilogtmp_json_array table");
-
- logger.info("Creating sarc_sushilogtmp_json_array table");
- String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n"
- + " `ItemIdentifier` ARRAY<\n"
- + " struct<\n"
- + " `Type`: STRING,\n"
- + " `Value`: STRING\n"
- + " >\n"
- + " >,\n"
- + " `ItemPerformance` struct<\n"
- + " `Period`: struct<\n"
- + " `Begin`: STRING,\n"
- + " `End`: STRING\n"
- + " >,\n"
- + " `Instance`: struct<\n"
- + " `Count`: STRING,\n"
- + " `MetricType`: STRING\n"
- + " >\n"
- + " >\n"
- + ")"
- + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
- + "LOCATION '" + sarcsReportPathArray + "/'\n"
- + "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(create_sarc_sushilogtmp_json_array);
- logger.info("Created sarc_sushilogtmp_json_array table");
-
- logger.info("Dropping sarc_sushilogtmp_json_non_array table");
- String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp_json_non_array";
- stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array);
- logger.info("Dropped sarc_sushilogtmp_json_non_array table");
-
- logger.info("Creating sarc_sushilogtmp_json_non_array table");
- String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n"
- + " `ItemIdentifier` struct<\n"
- + " `Type`: STRING,\n"
- + " `Value`: STRING\n"
- + " >,\n"
- + " `ItemPerformance` struct<\n"
- + " `Period`: struct<\n"
- + " `Begin`: STRING,\n"
- + " `End`: STRING\n"
- + " >,\n"
- + " `Instance`: struct<\n"
- + " `Count`: STRING,\n"
- + " `MetricType`: STRING\n"
- + " >\n"
- + " >"
- + ")"
- + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
- + "LOCATION '" + sarcsReportPathNonArray + "/'\n"
- + "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array);
- logger.info("Created sarc_sushilogtmp_json_non_array table");
-
- logger.info("Creating sarc_sushilogtmp table");
- String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp(source STRING, repository STRING, "
- + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
- + "tblproperties('transactional'='true')";
- stmt.executeUpdate(create_sarc_sushilogtmp);
- logger.info("Created sarc_sushilogtmp table");
-
- logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
- String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
- + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
- + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
- + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array "
- + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
- + "WHERE `ItemIdent`.`Type`='DOI'";
- stmt.executeUpdate(insert_sarc_sushilogtmp);
- logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
-
- logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
- insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
- + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
- + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
- + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
- stmt.executeUpdate(insert_sarc_sushilogtmp);
- logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
-
- ConnectDB.getHiveConnection().close();
- }
-
- public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
-
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Creating sushilog table");
- String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog "
- + "(`source` string, "
- + "`repository` string, "
- + "`rid` string, "
- + "`date` string, "
- + "`metric_type` string, "
- + "`count` int)";
- stmt.executeUpdate(createSushilog);
- logger.info("Created sushilog table");
-
- logger.info("Dropping sarc_sushilogtmp table");
- String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp";
- stmt.executeUpdate(drop_sarc_sushilogtmp);
- logger.info("Dropped sarc_sushilogtmp table");
- ConnectDB.getHiveConnection().close();
-
- List issnAndUrls = new ArrayList();
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030"
- });
- issnAndUrls.add(new String[]{
- "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015"
- });
-
- if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0
- && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) {
- logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload);
- issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload);
- }
-
- logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls);
-
- for (String[] issnAndUrl : issnAndUrls) {
- logger.info("Now working on ISSN: " + issnAndUrl[1]);
- getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]);
- }
-
- }
-
- public void finalizeSarcStats() throws Exception {
- stmtHive = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
- stmtImpala = ConnectDB.getImpalaConnection().createStatement();
-
- logger.info("Creating downloads_stats table_tmp");
- String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_tmp "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`result_id` string, "
- + "`date` string, "
- + "`count` bigint, "
- + "`openaire` bigint)";
- stmtHive.executeUpdate(createDownloadsStats);
- logger.info("Created downloads_stats_tmp table");
-
- logger.info("Dropping sarc_sushilogtmp_impala table");
- String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp_impala";
- stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala);
- logger.info("Dropped sarc_sushilogtmp_impala table");
-
- logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala");
- String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp_impala "
- + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
- stmtHive.executeUpdate(createSarcSushilogtmpImpala);
- logger.info("Created sarc_sushilogtmp_impala");
-
- logger.info("Making sarc_sushilogtmp visible to impala");
- String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp_impala;";
- stmtImpala.executeUpdate(invalidateMetadata);
-
- logger.info("Dropping downloads_stats_impala table");
- String drop_downloads_stats_impala = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_impala";
- stmtHive.executeUpdate(drop_downloads_stats_impala);
- logger.info("Dropped downloads_stats_impala table");
-
- logger.info("Making downloads_stats_impala deletion visible to impala");
- try {
- String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_impala;";
- stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala);
- } catch (SQLException sqle) {
- }
-
- // We run the following query in Impala because it is faster
- logger.info("Creating downloads_stats_impala");
- String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_impala AS "
- + "SELECT s.source, d.id AS repository_id, "
- + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
- + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, "
- + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
- + ConnectDB.getStatsDBSchema() + ".result_pids ro "
- + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
- + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
- stmtImpala.executeUpdate(createDownloadsStatsImpala);
- logger.info("Creating downloads_stats_impala");
-
- // Insert into downloads_stats
- logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp");
- String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_tmp SELECT * "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala";
- stmtHive.executeUpdate(insertDStats);
- logger.info("Inserted into downloads_stats_tmp");
-
- logger.info("Creating sushilog table");
- String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`rid` string, "
- + "`date` string, "
- + "`metric_type` string, "
- + "`count` int)";
- stmtHive.executeUpdate(createSushilog);
- logger.info("Created sushilog table");
-
- // Insert into sushilog
- logger.info("Inserting into sushilog");
- String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
- stmtHive.executeUpdate(insertSushiLog);
- logger.info("Inserted into sushilog");
-
- stmtHive.close();
- ConnectDB.getHiveConnection().close();
- }
-
- public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray,
- String url, String issn) throws Exception {
- logger.info("Processing SARC! issn: " + issn + " with url: " + url);
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
-
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
-
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- PreparedStatement st = ConnectDB
- .getHiveConnection()
- .prepareStatement(
- "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
- st.setString(1, issn);
- ResultSet rs_date = st.executeQuery();
- Date dateMax = null;
- while (rs_date.next()) {
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
-
- // Creating the needed configuration for the correct storing of data
- Configuration config = new Configuration();
- config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
- config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
- config
- .set(
- "fs.hdfs.impl",
- org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
- config
- .set(
- "fs.file.impl",
- org.apache.hadoop.fs.LocalFileSystem.class.getName());
- FileSystem dfs = FileSystem.get(config);
-
- if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) {
- logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn);
- } else {
-
- while (start.before(end)) {
- String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
- + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
- start.add(Calendar.MONTH, 1);
-
- logger.info("(getARReport) Getting report: " + reportUrl);
- String text = getJson(reportUrl);
- if (text == null) {
- continue;
- }
-
- JSONParser parser = new JSONParser();
- JSONObject jsonObject = null;
- try {
- jsonObject = (JSONObject) parser.parse(text);
- } // if there is a parsing error continue with the next url
- catch (ParseException pe) {
- continue;
- }
-
- jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse");
- jsonObject = (JSONObject) jsonObject.get("sc:Report");
- if (jsonObject == null) {
- continue;
- }
- jsonObject = (JSONObject) jsonObject.get("c:Report");
- jsonObject = (JSONObject) jsonObject.get("c:Customer");
- Object obj = jsonObject.get("c:ReportItems");
- JSONArray jsonArray = new JSONArray();
- if (obj instanceof JSONObject) {
- jsonArray.add(obj);
- } else {
- jsonArray = (JSONArray) obj;
- // jsonArray = (JSONArray) jsonObject.get("c:ReportItems");
- }
- if (jsonArray == null) {
- continue;
- }
-
- // Creating the file in the filesystem for the ItemIdentifier as array object
- String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_"
- + simpleDateFormat.format(start.getTime()) + ".json";
- logger.info("Storing to file: " + filePathArray);
- FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true);
-
- // Creating the file in the filesystem for the ItemIdentifier as array object
- String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_"
- + simpleDateFormat.format(start.getTime()) + ".json";
- logger.info("Storing to file: " + filePathNonArray);
- FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true);
-
- for (Object aJsonArray : jsonArray) {
-
- JSONObject jsonObjectRow = (JSONObject) aJsonArray;
- renameKeysRecursively(":", jsonObjectRow);
-
- if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) {
- finNonArray.write(jsonObjectRow.toJSONString().getBytes());
- finNonArray.writeChar('\n');
- } else {
- finArray.write(jsonObjectRow.toJSONString().getBytes());
- finArray.writeChar('\n');
- }
- }
-
- finArray.close();
- finNonArray.close();
-
- // Check the file size and if it is too big, delete it
- File fileArray = new File(filePathArray);
- if (fileArray.length() == 0)
- fileArray.delete();
- File fileNonArray = new File(filePathNonArray);
- if (fileNonArray.length() == 0)
- fileNonArray.delete();
-
- }
-
- dfs.close();
- }
- //ConnectDB.getHiveConnection().close();
- }
-
- private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception {
- for (Object jjval : givenJsonObj) {
- if (jjval instanceof JSONArray) {
- renameKeysRecursively(delimiter, (JSONArray) jjval);
- } else if (jjval instanceof JSONObject) {
- renameKeysRecursively(delimiter, (JSONObject) jjval);
- } // All other types of vals
- else
- ;
- }
- }
-
- private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception {
- Set jkeys = new HashSet(givenJsonObj.keySet());
- for (String jkey : jkeys) {
-
- String[] splitArray = jkey.split(delimiter);
- String newJkey = splitArray[splitArray.length - 1];
-
- Object jval = givenJsonObj.get(jkey);
- givenJsonObj.remove(jkey);
- givenJsonObj.put(newJkey, jval);
-
- if (jval instanceof JSONObject) {
- renameKeysRecursively(delimiter, (JSONObject) jval);
- }
-
- if (jval instanceof JSONArray) {
- renameKeysRecursively(delimiter, (JSONArray) jval);
- }
- }
- }
-
- private String getJson(String url) throws Exception {
- // String cred=username+":"+password;
- // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
- try {
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
- // connection.setRequestProperty ("Authorization", "Basic "+encoded);
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
- response.append("\n");
- }
- }
- return response.toString();
- } catch (Exception e) {
-
- // Logging error and silently continuing
- logger.error("Failed to get URL: " + e);
- System.out.println("Failed to get URL: " + e);
-// return null;
-// throw new Exception("Failed to get URL: " + e.toString(), e);
- }
- return "";
- }
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
deleted file mode 100644
index 405b58bd5..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
+++ /dev/null
@@ -1,179 +0,0 @@
-
-package eu.dnetlib.oa.graph.usagestats.export;
-
-import java.io.IOException;
-import java.sql.SQLException;
-import java.sql.Statement;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Main class for downloading and processing Usage statistics
- *
- * @author D. Pierrakos, S. Zoupanos
- */
-public class UsageStatsExporter {
-
- public UsageStatsExporter() {
-
- }
-
- private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
-
- private void reCreateLogDirs() throws IllegalArgumentException, IOException {
- FileSystem dfs = FileSystem.get(new Configuration());
-
- logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
- dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
-
- logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
- dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
-
- logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
- dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
-
- logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
-
- logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
-
- logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
- }
-
- public void export() throws Exception {
-
- logger.info("Initialising DB properties");
- ConnectDB.init();
-
-// runImpalaQuery();
-
- PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
-
- logger.info("Re-creating database and tables");
- if (ExecuteWorkflow.recreateDbAndTables)
- piwikstatsdb.recreateDBAndTables();
- ;
-
- logger.info("Initializing the download logs module");
- PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
-
- if (ExecuteWorkflow.piwikEmptyDirs) {
- logger.info("Recreating Piwik log directories");
- piwikstatsdb.reCreateLogDirs();
- }
-
- // Downloading piwik logs (also managing directory creation)
- if (ExecuteWorkflow.downloadPiwikLogs) {
- logger.info("Downloading piwik logs");
- piwd
- .GetOpenAIRELogs(
- ExecuteWorkflow.repoLogPath,
- ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
- }
- logger.info("Downloaded piwik logs");
-
- // Create DB tables, insert/update statistics
- String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
- piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
-
- if (ExecuteWorkflow.processPiwikLogs) {
- logger.info("Processing logs");
- piwikstatsdb.processLogs();
- }
-
- logger.info("Creating LaReferencia tables");
- LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
- ExecuteWorkflow.lareferenciaAuthToken);
-
- if (ExecuteWorkflow.laReferenciaEmptyDirs) {
- logger.info("Recreating LaReferencia log directories");
- lrf.reCreateLogDirs();
- }
-
- if (ExecuteWorkflow.downloadLaReferenciaLogs) {
- logger.info("Downloading LaReferencia logs");
- lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
- logger.info("Downloaded LaReferencia logs");
- }
- LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
-
- if (ExecuteWorkflow.processLaReferenciaLogs) {
- logger.info("Processing LaReferencia logs");
- lastats.processLogs();
- logger.info("LaReferencia logs done");
- }
-
- IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
- if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
- logger.info("Creating Irus Stats tables");
- irusstats.createTables();
- logger.info("Created Irus Stats tables");
-
- logger.info("Re-create log dirs");
- irusstats.reCreateLogDirs();
- logger.info("Re-created log dirs");
- }
-
- if (ExecuteWorkflow.irusDownloadReports) {
- irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
- }
- if (ExecuteWorkflow.irusProcessStats) {
- irusstats.processIrusStats();
- logger.info("Irus done");
- }
-
- SarcStats sarcStats = new SarcStats();
- if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
- sarcStats.reCreateLogDirs();
- }
- if (ExecuteWorkflow.sarcDownloadReports) {
- sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
- }
- if (ExecuteWorkflow.sarcProcessStats) {
- sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
- sarcStats.finalizeSarcStats();
- }
- logger.info("Sarc done");
-
- // finalize usagestats
- if (ExecuteWorkflow.finalizeStats) {
- piwikstatsdb.finalizeStats();
- logger.info("Finalized stats");
- }
-
- // Make the tables available to Impala
- if (ExecuteWorkflow.finalTablesVisibleToImpala) {
- logger.info("Making tables visible to Impala");
- invalidateMetadata();
- }
-
- logger.info("End");
- }
-
- private void invalidateMetadata() throws SQLException {
- Statement stmt = null;
-
- stmt = ConnectDB.getImpalaConnection().createStatement();
-
- String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
- stmt.executeUpdate(sql);
-
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
- stmt.executeUpdate(sql);
-
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
- stmt.executeUpdate(sql);
-
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
- stmt.executeUpdate(sql);
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
- }
-}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json
deleted file mode 100644
index 988c23b48..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json
+++ /dev/null
@@ -1,231 +0,0 @@
-[
- {
- "paramName": "mat",
- "paramLongName": "matomoAuthToken",
- "paramDescription": "when true will stop SparkSession after job execution",
- "paramRequired": false
- },
- {
- "paramName": "mbu",
- "paramLongName": "matomoBaseURL",
- "paramDescription": "URL of the isLookUp Service",
- "paramRequired": true
- },
- {
- "paramName": "rlp",
- "paramLongName": "repoLogPath",
- "paramDescription": "nameNode of the source cluster",
- "paramRequired": true
- },
- {
- "paramName": "plp",
- "paramLongName": "portalLogPath",
- "paramDescription": "namoNode of the target cluster",
- "paramRequired": true
- },
- {
- "paramName": "pmi",
- "paramLongName": "portalMatomoID",
- "paramDescription": "namoNode of the target cluster",
- "paramRequired": true
- },
- {
- "paramName": "iukbuw",
- "paramLongName": "irusUKBaseURL",
- "paramDescription": "working directory",
- "paramRequired": true
- },
- {
- "paramName": "iukrp",
- "paramLongName": "irusUKReportPath",
- "paramDescription": "maximum number of map tasks used in the distcp process",
- "paramRequired": true
- },
- {
- "paramName": "srpa",
- "paramLongName": "sarcsReportPathArray",
- "paramDescription": "memory for distcp action copying actionsets from remote cluster",
- "paramRequired": true
- },
- {
- "paramName": "srpna",
- "paramLongName": "sarcsReportPathNonArray",
- "paramDescription": "timeout for distcp copying actions from remote cluster",
- "paramRequired": true
- },
- {
- "paramName": "llp",
- "paramLongName": "lareferenciaLogPath",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "lbu",
- "paramLongName": "lareferenciaBaseURL",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "lat",
- "paramLongName": "lareferenciaAuthToken",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "dbhu",
- "paramLongName": "dbHiveUrl",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "dbiu",
- "paramLongName": "dbImpalaUrl",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "usdbs",
- "paramLongName": "usageStatsDBSchema",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "sdbs",
- "paramLongName": "statsDBSchema",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "rdbt",
- "paramLongName": "recreateDbAndTables",
- "paramDescription": "Re-create database and initial tables?",
- "paramRequired": true
- },
- {
- "paramName": "pwed",
- "paramLongName": "piwikEmptyDirs",
- "paramDescription": "Empty piwik directories?",
- "paramRequired": true
- },
- {
- "paramName": "ppwl",
- "paramLongName": "processPiwikLogs",
- "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
- "paramRequired": true
- },
- {
- "paramName": "dpwl",
- "paramLongName": "downloadPiwikLogs",
- "paramDescription": "download piwik logs?",
- "paramRequired": true
- },
- {
- "paramName": "slp",
- "paramLongName": "startingLogPeriod",
- "paramDescription": "Starting log period",
- "paramRequired": true
- },
- {
- "paramName": "elp",
- "paramLongName": "endingLogPeriod",
- "paramDescription": "Ending log period",
- "paramRequired": true
- },
- {
- "paramName": "npidd",
- "paramLongName": "numberOfPiwikIdsToDownload",
- "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload",
- "paramRequired": true
- },
- {
- "paramName": "nsidd",
- "paramLongName": "numberOfSiteIdsToDownload",
- "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload",
- "paramRequired": true
- },
- {
- "paramName": "lerd",
- "paramLongName": "laReferenciaEmptyDirs",
- "paramDescription": "Empty LaReferencia directories?",
- "paramRequired": true
- },
- {
- "paramName": "plrl",
- "paramLongName": "processLaReferenciaLogs",
- "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data",
- "paramRequired": true
- },
- {
- "paramName": "dlrl",
- "paramLongName": "downloadLaReferenciaLogs",
- "paramDescription": "download La Referencia logs?",
- "paramRequired": true
- },
- {
- "paramName": "icted",
- "paramLongName": "irusCreateTablesEmptyDirs",
- "paramDescription": "Irus section: Create tables and empty JSON directories?",
- "paramRequired": true
- },
- {
- "paramName": "idr",
- "paramLongName": "irusDownloadReports",
- "paramDescription": "Irus section: Download reports?",
- "paramRequired": true
- },
- {
- "paramName": "ipr",
- "paramLongName": "irusProcessStats",
- "paramDescription": "Irus section: Process stats?",
- "paramRequired": true
- },
- {
- "paramName": "inod",
- "paramLongName": "irusNumberOfOpendoarsToDownload",
- "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload",
- "paramRequired": true
- },
- {
- "paramName": "icted",
- "paramLongName": "sarcCreateTablesEmptyDirs",
- "paramDescription": "Sarc section: Create tables and empty JSON directories?",
- "paramRequired": true
- },
- {
- "paramName": "idr",
- "paramLongName": "sarcDownloadReports",
- "paramDescription": "Sarc section: Download reports?",
- "paramRequired": true
- },
- {
- "paramName": "ipr",
- "paramLongName": "sarcProcessStats",
- "paramDescription": "Sarc section: Process stats?",
- "paramRequired": true
- },
- {
- "paramName": "inod",
- "paramLongName": "sarcNumberOfIssnToDownload",
- "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
- "paramRequired": true
- },
-
- {
- "paramName": "fs",
- "paramLongName": "finalizeStats",
- "paramDescription": "Create the usage_stats table?",
- "paramRequired": true
- },
- {
- "paramName": "ftvi",
- "paramLongName": "finalTablesVisibleToImpala",
- "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
- "paramRequired": true
- },
- {
- "paramName": "nodt",
- "paramLongName": "numberOfDownloadThreads",
- "paramDescription": "Number of download threads",
- "paramRequired": true
- }
-]
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml
deleted file mode 100644
index b5c807378..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-
-
- jobTracker
- ${jobTracker}
-
-
- nameNode
- ${nameNode}
-
-
- oozie.use.system.libpath
- true
-
-
- oozie.action.sharelib.for.spark
- spark2
-
-
- hiveMetastoreUris
- thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
-
-
- hiveJdbcUrl
- jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1
-
-
- impalaJdbcUrl
- jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl;
-
-
- oozie.wf.workflow.notification.url
- {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status
-
-
- oozie.use.system.libpath
- true
-
-
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml
deleted file mode 100644
index 8d62a85a9..000000000
--- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml
+++ /dev/null
@@ -1,90 +0,0 @@
-
-
-
- hiveMetastoreUris
- Hive server metastore URIs
-
-
- hiveJdbcUrl
- Hive server jdbc url
-
-
- impalaJdbcUrl
- Impala server jdbc url
-
-
-
-
- ${jobTracker}
- ${nameNode}
-
-
- hive.metastore.uris
- ${hiveMetastoreUris}
-
-
- mapreduce.job.queuename
- ${queueName}
-
-
- oozie.launcher.mapred.job.queue.name
- ${oozieLauncherQueueName}
-
-
-
-
-
-
-
- Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
-
-
-
- eu.dnetlib.oa.graph.usagestats.export.ExecuteWorkflow
- --matomoAuthToken${matomoAuthToken}
- --matomoBaseURL${matomoBaseURL}
- --repoLogPath${repoLogPath}
- --portalLogPath${portalLogPath}
- --portalMatomoID${portalMatomoID}
- --irusUKBaseURL${irusUKBaseURL}
- --irusUKReportPath${irusUKReportPath}
- --sarcsReportPathArray${sarcsReportPathArray}
- --sarcsReportPathNonArray${sarcsReportPathNonArray}
- --lareferenciaLogPath${lareferenciaLogPath}
- --lareferenciaBaseURL${lareferenciaBaseURL}
- --lareferenciaAuthToken${lareferenciaAuthToken}
- --dbHiveUrl${hiveJdbcUrl}
- --dbImpalaUrl${impalaJdbcUrl}
- --usageStatsDBSchema${usageStatsDBSchema}
- --statsDBSchema${statsDBSchema}
- --recreateDbAndTables${recreateDbAndTables}
- --piwikEmptyDirs${piwikEmptyDirs}
- --downloadPiwikLogs${downloadPiwikLogs}
- --processPiwikLogs${processPiwikLogs}
- --startingLogPeriod${startingLogPeriod}
- --endingLogPeriod${endingLogPeriod}
- --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload}
- --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload}
- --laReferenciaEmptyDirs${laReferenciaEmptyDirs}
- --downloadLaReferenciaLogs${downloadLaReferenciaLogs}
- --processLaReferenciaLogs${processLaReferenciaLogs}
- --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs}
- --irusDownloadReports${irusDownloadReports}
- --irusProcessStats${irusProcessStats}
- --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload}
- --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs}
- --sarcDownloadReports${sarcDownloadReports}
- --sarcProcessStats${sarcProcessStats}
- --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload}
- --finalizeStats${finalizeStats}
- --finalTablesVisibleToImpala${finalTablesVisibleToImpala}
- --numberOfDownloadThreads${numberOfDownloadThreads}
-
-
-
-
-
-
-
-