diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml
new file mode 100644
index 000000000..a65c4514a
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project-shared-configuration>
+    <properties xmlns="http://www.netbeans.org/ns/maven-properties-data/1">
+        <netbeans.hint.jdkPlatform>JDK_1.8</netbeans.hint.jdkPlatform>
+    </properties>
+</project-shared-configuration>
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
index 14b543a57..5593d4d87 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml
@@ -23,7 +23,35 @@
 	<modelVersion>4.0.0</modelVersion>
 	<artifactId>dhp-usage-datasets-stats-update</artifactId>
-
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>pl.project13.maven</groupId>
+				<artifactId>git-commit-id-plugin</artifactId>
+				<version>2.1.15</version>
+				<executions>
+					<execution>
+						<goals>
+							<goal>revision</goal>
+						</goals>
+					</execution>
+				</executions>
+				<configuration>
+					<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
+				</configuration>
+			</plugin>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<version>3.6.1</version>
+				<configuration>
+					<source>1.8</source>
+					<target>1.8</target>
+				</configuration>
+			</plugin>
+		</plugins>
+	</build>
 	<properties>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
@@ -68,6 +96,11 @@
 			<artifactId>dhp-common</artifactId>
 			<version>${project.version}</version>
 		</dependency>
+		<dependency>
+			<groupId>com.mchange</groupId>
+			<artifactId>c3p0</artifactId>
+			<version>0.9.5.2</version>
+		</dependency>
 		<dependency>
 			<groupId>c3p0</groupId>
 			<artifactId>c3p0</artifactId>
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh
new file mode 100755
index 000000000..9b4325508
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh
@@ -0,0 +1 @@
+mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats
\ No newline at end of file
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
index e6da7eff3..25b30e8ad 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java
@@ -32,8 +32,8 @@ public abstract class ConnectDB {
private static String datasetUsageStatsDBSchema;
private static String statsDBSchema;
private final static Logger logger = Logger.getLogger(ConnectDB.class);
- private Statement stmt = null;
-
+ private Statement stmt = null;
+
static void init() throws ClassNotFoundException {
dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
@@ -79,6 +79,7 @@ public abstract class ConnectDB {
*/
ComboPooledDataSource cpds = new ComboPooledDataSource();
cpds.setJdbcUrl(dbHiveUrl);
+ cpds.setUser("dimitris.pierrakos");
cpds.setAcquireIncrement(1);
cpds.setMaxPoolSize(100);
cpds.setMinPoolSize(1);
@@ -93,10 +94,10 @@ public abstract class ConnectDB {
cpds.setCheckoutTimeout(0);
cpds.setPreferredTestQuery("SELECT 1");
cpds.setIdleConnectionTestPeriod(60);
-
- logger.info("Opened database successfully");
- return cpds.getConnection();
+ logger.info("Opened database successfully");
+
+ return cpds.getConnection();
}
@@ -107,6 +108,7 @@ public abstract class ConnectDB {
*/
ComboPooledDataSource cpds = new ComboPooledDataSource();
cpds.setJdbcUrl(dbImpalaUrl);
+ cpds.setUser("dimitris.pierrakos");
cpds.setAcquireIncrement(1);
cpds.setMaxPoolSize(100);
cpds.setMinPoolSize(1);
@@ -122,81 +124,8 @@ public abstract class ConnectDB {
cpds.setPreferredTestQuery("SELECT 1");
cpds.setIdleConnectionTestPeriod(60);
- logger.info("Opened database successfully");
+ logger.info("Opened database successfully");
return cpds.getConnection();
}
-
- private void createDatabase() throws Exception {
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Dropping logs DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
- String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
- stmt.executeUpdate(dropDatabase);
- } catch (Exception e) {
- logger.error("Failed to drop database: " + e);
- throw new Exception("Failed to drop database: " + e.toString(), e);
- }
-
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
- String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
- stmt.executeUpdate(createDatabase);
-
- } catch (Exception e) {
- logger.error("Failed to create database: " + e);
- throw new Exception("Failed to create database: " + e.toString(), e);
- }
- }
-
- private void createTables() throws Exception {
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- // Create Piwiklog table - This table should exist
- String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
- + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, action, timestamp, entity_id) "
- + "into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTablePiwikLog);
-
- /////////////////////////////////////////
- // Rule for duplicate inserts @ piwiklog
- /////////////////////////////////////////
-
- String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
- + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTablePortalLog);
-
- //////////////////////////////////////////////////
- // Rule for duplicate inserts @ process_portal_log
- //////////////////////////////////////////////////
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
-
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
}
-/*
-CREATE TABLE IF NOT EXISTS dataciteReports (reportid STRING,
- name STRING,
- source STRING,
- release STRING,
- createdby STRING,
- report_end_date STRING,
- report_start_date STRING)
- CLUSTERED BY (reportid)
- into 100 buckets stored as orc tblproperties('transactional'='true');
-*/
\ No newline at end of file
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java
new file mode 100644
index 000000000..88db1f819
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java
@@ -0,0 +1,168 @@
+
+package eu.dnetlib.oa.graph.datasetsusagestats.export;
+
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class DatasetsStatsDB {
+
+ private String logPath;
+ private String logRepoPath;
+ private String logPortalPath;
+
+ private Statement stmt = null;
+
+ private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class);
+
+ private String CounterRobotsURL;
+ private ArrayList robotsList;
+
+ public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception {
+ this.logRepoPath = logRepoPath;
+ this.logPortalPath = logPortalPath;
+
+ }
+
+ public void recreateDBAndTables() throws Exception {
+ this.createDatabase();
+ this.createTables();
+ }
+
+// public void reCreateLogDirs() throws IllegalArgumentException, IOException {
+// FileSystem dfs = FileSystem.get(new Configuration());
+//
+// logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
+// dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
+//
+// logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
+// dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
+//
+// logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
+// dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
+//
+// logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
+// dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
+// }
+ public ArrayList getRobotsList() {
+ return robotsList;
+ }
+
+ public void setRobotsList(ArrayList robotsList) {
+ this.robotsList = robotsList;
+ }
+
+ public String getCounterRobotsURL() {
+ return CounterRobotsURL;
+ }
+
+ public void setCounterRobotsURL(String CounterRobotsURL) {
+ this.CounterRobotsURL = CounterRobotsURL;
+ }
+
+ private void createDatabase() throws Exception {
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
+ String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
+ stmt.executeUpdate(dropDatabase);
+ } catch (Exception e) {
+ logger.error("Failed to drop database: " + e);
+ throw new Exception("Failed to drop database: " + e.toString(), e);
+ }
+
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
+ String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
+ stmt.executeUpdate(createDatabase);
+
+ } catch (Exception e) {
+ logger.error("Failed to create database: " + e);
+ throw new Exception("Failed to create database: " + e.toString(), e);
+ }
+ }
+
+ private void createTables() throws Exception {
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
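+ // All tables below are bucketed ORC tables created with 'transactional'='true', i.e. Hive ACID tables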
+ // Create Reports table - This table should exist
+ logger.info("Creating Reports Table");
+ String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getDataSetUsageStatsDBSchema()
+ + ".datacitereports(reportid STRING, \n"
+ + " name STRING, \n"
+ + " source STRING,\n"
+ + " release STRING,\n"
+ + " createdby STRING,\n"
+ + " report_start_date STRING,\n"
+ + " report_end_date STRING)\n"
+ + " CLUSTERED BY (reportid)\n"
+ + " into 100 buckets stored as orc tblproperties('transactional'='true')";
+
+ stmt.executeUpdate(sqlCreateTableDataciteReports);
+ logger.info("Reports Table Created");
+
+ // Create Datasets Table
+ logger.info("Creating DataSets Table");
+ String sqlCreateTableDataSets = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getDataSetUsageStatsDBSchema()
+ + ".datasets(ds_type STRING,\n"
+ + " ds_title STRING,\n"
+ + " yop STRING,\n"
+ + " uri STRING,\n"
+ + " platform STRING,\n"
+ + " data_type STRING,\n"
+ + " publisher STRING,\n"
+ + " publisher_id_type STRING,\n"
+ + " publisher_id_value STRING,\n"
+ + " ds_dates_type STRING,\n"
+ + " ds_pub_date STRING,\n"
+ + " ds_contributors STRING,\n"
+ // + " ds_contributor_value array ,\n"
+ + " reportid STRING)\n"
+ + " CLUSTERED BY (ds_type)\n"
+ + " into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableDataSets);
+ logger.info("DataSets Table Created");
+
+ // Create Datasets Performance Table
+ logger.info("Creating DataSetsPerformance Table");
+ String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getDataSetUsageStatsDBSchema()
+ + ".datasetsperformance(ds_type STRING,\n"
+ + " period_end STRING,\n"
+ + " period_from STRING,\n"
+ + " access_method STRING,\n"
+ + " metric_type STRING,\n"
+ + " count INT,\n"
+ + " country_counts STRING,\n"
+ + " reportid STRING)\n"
+ + " CLUSTERED BY (ds_type)\n"
+ + " into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableDataSetsPerformance);
+ logger.info("DataSetsPerformance Table Created");
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
+
+ private Connection getConnection() throws SQLException {
+ return ConnectDB.getHiveConnection();
+ }
+}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java
index 196238ea2..a73b299ec 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java
@@ -1,97 +1,102 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-package eu.dnetlib.oa.graph.datasetsusagestats.export;
-
-import com.google.gson.Gson;
-import com.google.gson.JsonArray;
-import com.google.gson.JsonElement;
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.net.URL;
-import com.google.gson.JsonObject;
-import java.util.ArrayList;
-import java.util.Iterator;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.json.simple.parser.ParseException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- *
- * @author dpie
- */
-public class DownloadReportsListFromDatacite {
-
- private String dataciteBaseURL;
- private String dataciteReportPath;
- private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
-
- public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) throws MalformedURLException, Exception {
-
- this.dataciteBaseURL = dataciteBaseURL;
- this.dataciteReportPath = dataciteReportPath;
- }
-
- public void downloadReportsList() throws ParseException {
- StringBuilder responseStrBuilder = new StringBuilder();
-
- Gson gson = new Gson();
-
- try {
- BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream());
- logger.info("Downloading from " + dataciteBaseURL);
-
- BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
- String inputStr;
-
- while ((inputStr = streamReader.readLine()) != null) {
- responseStrBuilder.append(inputStr);
- }
- } catch (IOException e) {
- logger.info(e.getMessage());
- }
- JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class);
- JsonArray dataArray = jsonObject.getAsJsonArray("reports");
- ArrayList reportsList = new ArrayList();
- for (JsonElement element : dataArray) {
- reportsList.add(element.getAsJsonObject().get("id").getAsString());
- }
-
- Iterator it = reportsList.iterator();
- while (it.hasNext()) {
- String reportId = it.next().toString();
- String url = dataciteBaseURL + reportId;
-
- try {
- BufferedInputStream in = new BufferedInputStream(new URL(url).openStream());
- BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
- String inputStr;
- StringBuilder responseStrBuilder2 = new StringBuilder();
- while ((inputStr = streamReader.readLine()) != null) {
- responseStrBuilder2.append(inputStr);
- }
- FileSystem fs = FileSystem.get(new Configuration());
- FSDataOutputStream fin = fs.create(new Path(dataciteReportPath + "/" + reportId + ".json"),
- true);
- byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes();
- fin.write(jsonObjectRawBytes);
- fin.writeChar('\n');
-
- fin.close();
-
- fin.close();
- } catch (IOException e) {
- System.out.println(e);
- }
- }
- }
-}
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.datasetsusagestats.export;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.parser.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+
+/**
+ * @author dpie
+ */
+public class DownloadReportsListFromDatacite {
+
+ private String dataciteBaseURL;
+ private String dataciteReportPath;
+ private static final Logger logger = LoggerFactory.getLogger(DownloadReportsListFromDatacite.class);
+
+ public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath)
+ throws MalformedURLException, Exception {
+
+ this.dataciteBaseURL = dataciteBaseURL;
+ this.dataciteReportPath = dataciteReportPath;
+ }
+
+ public void downloadReportsList() throws ParseException {
+ StringBuilder responseStrBuilder = new StringBuilder();
+
+ Gson gson = new Gson();
+
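+ // First fetch the JSON list of available reports from the Datacite reports endpoint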
+ try {
+ BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream());
+ logger.info("Downloading from " + dataciteBaseURL);
+
+ BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+ String inputStr;
+
+ while ((inputStr = streamReader.readLine()) != null) {
+ responseStrBuilder.append(inputStr);
+ }
+ } catch (IOException e) {
+ logger.info(e.getMessage());
+ }
+ JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class);
+ JsonArray dataArray = jsonObject.getAsJsonArray("reports");
+ ArrayList reportsList = new ArrayList();
+ for (JsonElement element : dataArray) {
+ reportsList.add(element.getAsJsonObject().get("id").getAsString());
+ }
+
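+ // Then download each report by its id and store it under dataciteReportPath as '<reportId>.json' in HDFS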
+ Iterator it = reportsList.iterator();
+ while (it.hasNext()) {
+ String reportId = it.next().toString();
+ String url = dataciteBaseURL + reportId;
+
+ try {
+ BufferedInputStream in = new BufferedInputStream(new URL(url).openStream());
+ BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+ String inputStr;
+ StringBuilder responseStrBuilder2 = new StringBuilder();
+ while ((inputStr = streamReader.readLine()) != null) {
+ responseStrBuilder2.append(inputStr);
+ }
+ FileSystem fs = FileSystem.get(new Configuration());
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(dataciteReportPath + "/" + reportId + ".json"),
+ true);
+ byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
+
+ fin.close();
+ } catch (IOException e) {
+ System.out.println(e);
+ }
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
index 7b3db3115..b28578e4b 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java
@@ -18,14 +18,13 @@ public class ExecuteWorkflow {
static String dataciteBaseURL;
static String dataciteReportPath;
- static String dbHiveUrl;
- static String dbImpalaUrl;
- static String datasetUsageStatsDBSchema;
- static String statsDBSchema;
- static boolean recreateDbAndTables;
- static boolean datasetsEmptyDirs;
- static boolean finalTablesVisibleToImpala;
-
+ static String dbHiveUrl;
+ static String dbImpalaUrl;
+ static String datasetUsageStatsDBSchema;
+ static String statsDBSchema;
+ static boolean recreateDbAndTables;
+ static boolean datasetsEmptyDirs;
+ static boolean finalTablesVisibleToImpala;
public static void main(String args[]) throws Exception {
@@ -58,11 +57,11 @@ public class ExecuteWorkflow {
else
datasetsEmptyDirs = false;
- if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
- finalTablesVisibleToImpala = true;
- else
- finalTablesVisibleToImpala = false;
-
+// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
+// finalTablesVisibleToImpala = true;
+// else
+// finalTablesVisibleToImpala = false;
+//
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
usagestatsExport.export();
}
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
new file mode 100644
index 000000000..ccb3eebd3
--- /dev/null
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
@@ -0,0 +1,408 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.datasetsusagestats.export;
+
+import java.io.*;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.sql.Array;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+
+/**
+ * @author dpie
+ */
+public class ReadReportsListFromDatacite {
+
+ private String dataciteReportPath;
+ private static final Logger logger = LoggerFactory.getLogger(ReadReportsListFromDatacite.class);
+
+ public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception {
+
+ this.dataciteReportPath = dataciteReportPath;
+ }
+
+ public void readReports() throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+ File folder = new File(dataciteReportPath);
+ ArrayList jsonFiles = listHdfsDir(dataciteReportPath);
+ for (String jsonFile : jsonFiles) {
+ logger.info("Reading report file " + jsonFile);
+ this.createTmpReportsTable(jsonFile);
+
+ String sqlSelectReportID = "SELECT get_json_object(json, '$.report.id') FROM "
+ + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ stmt.execute(sqlSelectReportID);
+ ResultSet rstmpReportID = stmt.getResultSet();
+
+ String reportID = null;
+ while (rstmpReportID.next()) {
+ reportID = rstmpReportID.getString(1);
+ }
+
+ logger.info("Checking report with id " + reportID);
+ String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema()
+ + ".datacitereports where reportid=?";
+ PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists);
+ stGetReportID.setString(1, reportID);
+
+ ResultSet rsCheckIfReportExist = stGetReportID.executeQuery();
+
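+ // Skip reports already present in datacitereports; otherwise insert the report, its datasets and their performance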
+ if (rsCheckIfReportExist.next()) {
+ logger.info("Report found with ID " + reportID);
+ dropTmpReportsTable();
+ } else {
+ String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
+ + " .datacitereports "
+ + "SELECT\n"
+ + " get_json_object(json, '$.report.id') AS reportid,\n"
+ + " get_json_object(json, '$.report.report-header.report-name') AS name,\n"
+ + " get_json_object(json, '$.report.report-header.report-id') AS source,\n"
+ + " get_json_object(json, '$.report.report-header.release') AS release,\n"
+ + " get_json_object(json, '$.report.report-header.created-by\') AS createdby,\n"
+ + " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n"
+ + " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n"
+ + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ stmt.execute(sqlInsertReport);
+
+ logger.info("Report added");
+
+ logger.info("Adding datasets");
+ String sqlSelecteDatasetsArray = "SELECT get_json_object(json, '$.report.report-datasets') FROM "
+ + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ stmt.execute(sqlSelecteDatasetsArray);
+ ResultSet rstmpReportDatasets = stmt.getResultSet();
+
+ if (rstmpReportDatasets.next() && rstmpReportDatasets.getString(1).indexOf(',') > 0) {
+ String[] listDatasets = rstmpReportDatasets.getString(1).split(",");
+ logger.info("Datasets found " + listDatasets.length);
+
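+ // Insert one row into datasets per element of the report-datasets array, then its performance instances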
+ for (int i = 0; i < listDatasets.length; i++) {
+
+ String sqlInsertDataSets = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
+ + " .datasets "
+ + "SELECT\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].dataset-id[0].value') AS ds_type,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].dataset-title') AS ds_title,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i + "].yop') AS yop,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i + "].uri') AS uri,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i + "].platform') AS platform,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i + "].data-type') AS data_type,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i + "].publisher') AS publisher,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].publisher-id.type[0]') AS publisher_id_type,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].publisher-id.value[0]') AS publisher_id_value,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].dataset-dates.type[0]') AS ds_dates_type,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].dataset-dates.value[0]') AS ds_dates_value,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].dataset-contributors') AS ds_contributors,\n"
+ + " get_json_object(json, '$.report.id') AS reportid \n"
+ + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ stmt.execute(sqlInsertDataSets);
+
+ logger.info("Dataset added " + i);
+
+ logger.info("Adding Dataset Performance");
+ String sqlSelecteDatasetsPerformance = "SELECT get_json_object(json, '$.report.report-datasets["
+ + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ stmt.execute(sqlSelecteDatasetsPerformance);
+ ResultSet rstmpReportDatasetsPerformance = stmt.getResultSet();
+ if (rstmpReportDatasetsPerformance.next()
+ && rstmpReportDatasetsPerformance.getString(1).indexOf(',') > 0) {
+ String[] listDatasetsPerformance = rstmpReportDatasetsPerformance.getString(1).split(",");
+ logger.info("Datasets Performance found " + listDatasetsPerformance.length);
+ for (int j = 0; j < listDatasetsPerformance.length; j++) {
+ String sqlSelecteDatasetsPerformanceInstance = "SELECT get_json_object(json, '$.report.report-datasets["
+ + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema()
+ + ".tmpjson";
+ stmt.execute(sqlSelecteDatasetsPerformanceInstance);
+ ResultSet rstmpReportDatasetsPerformanceInstance = stmt.getResultSet();
+ if (rstmpReportDatasetsPerformanceInstance.next()
+ && rstmpReportDatasetsPerformanceInstance.getString(1).indexOf(',') > 0) {
+ String[] listDatasetsPerformanceInstance = rstmpReportDatasetsPerformanceInstance
+ .getString(1)
+ .split(",");
+ logger.info("Datasets Performance found " + listDatasetsPerformanceInstance.length);
+ for (int k = 0; k < listDatasetsPerformanceInstance.length; k++) {
+ String sqlInsertDataSetsPerformance = "INSERT INTO "
+ + ConnectDB.getDataSetUsageStatsDBSchema() + " .datasetsperformance "
+ + "SELECT\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].dataset-id[0].value') AS ds_type,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].performance[" + j + "].period.end-date') AS period_end,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].performance[" + j + "].period.begin-date') AS period_from,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].performance[" + j + "].instance[" + k
+ + "].access-method') AS access_method,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].performance[" + j + "].instance[" + k
+ + "].metric-type') AS metric_type,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].performance[" + j + "].instance[" + k + "].count') AS count,\n"
+ + " get_json_object(json, '$.report.report-datasets[" + i
+ + "].performance[" + j + "].instance[" + k
+ + "].country-counts') AS country_counts,\n"
+ + " get_json_object(json, '$.report.id') AS reportid \n"
+ + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ stmt.execute(sqlInsertDataSetsPerformance);
+ }
+ }
+ }
+ }
+ logger.info("DatasetPerformance added for dataset" + i);
+ }
+ }
+ logger.info("Adding gzip performance");
+ String sqlSelecteReportSubsets = "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM "
+ + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ stmt.execute(sqlSelecteReportSubsets);
+ ResultSet rstmpReportSubsets = stmt.getResultSet();
+ if (rstmpReportSubsets.next()) {
+ String unCompressedReport = uncompressString(rstmpReportSubsets.getString(1));
+ this.readCompressedReport(unCompressedReport, reportID);
+ }
+ }
+ }
+ this.dropTmpReportsTable();
+ }
+
+ public void readCompressedReport(String report, String reportId) throws Exception {
+ Gson gson = new Gson();
+ JsonObject jsonObject = gson.fromJson(report, JsonObject.class);
+
+ JsonArray jsonReportDatasets;
+ if (jsonObject.getAsJsonArray("report_datasets") != null) {
+ jsonReportDatasets = jsonObject.getAsJsonArray("report_datasets");
+ } else {
+ jsonReportDatasets = jsonObject.getAsJsonArray("report-datasets");
+ }
+
+ for (JsonElement datasetElement : jsonReportDatasets) {
+ // JsonElement dataset_title = datasetElement.getAsJsonObject().get("dataset-title");
+ String dataset_title = datasetElement.getAsJsonObject().get("dataset-title").getAsString();
+ String yop = datasetElement.getAsJsonObject().get("yop").getAsString();
+ String uri = datasetElement.getAsJsonObject().get("uri").getAsString();
+ String platform = datasetElement.getAsJsonObject().get("platform").getAsString();
+ String data_type = datasetElement.getAsJsonObject().get("data-type").getAsString();
+ String publisher = datasetElement.getAsJsonObject().get("publisher").getAsString();
+
+ JsonArray publisher_id = datasetElement.getAsJsonObject().getAsJsonArray("publisher-id");
+ String publisher_id_type = "";
+ String publisher_id_value = "";
+ for (JsonElement publisher_id_Element : publisher_id) {
+ publisher_id_type = publisher_id_Element.getAsJsonObject().get("type").getAsString();
+ publisher_id_value = publisher_id_Element.getAsJsonObject().get("value").getAsString();
+ }
+ JsonArray dataset_days = datasetElement.getAsJsonObject().getAsJsonArray("dataset-dates");
+ String ds_dates_type = "";
+ String ds_dates_value = "";
+ for (JsonElement datasetDaysElement : dataset_days) {
+ ds_dates_type = datasetDaysElement.getAsJsonObject().get("type").getAsString();
+ ds_dates_value = datasetDaysElement.getAsJsonObject().get("value").getAsString();
+ }
+
+ JsonArray datasetContributors = null;
+ String ds_contributor_type = "";
+ String[] ds_contributor_values = null;
+ Array ds_contributor_valuesArr = null;
+
+ if (datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors") != null) {
+ datasetContributors = datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors");
+
+ JsonArray datasetid = datasetElement.getAsJsonObject().getAsJsonArray("dataset-id");
+ String doi = "";
+ for (JsonElement datasetIDElement : datasetid) {
+ // System.out.println(datasetIDElement.getAsJsonObject().get("value").getAsString());
+ doi = datasetIDElement.getAsJsonObject().get("value").getAsString();
+ }
+
+ String sqlInsertDataset = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
+ + " .datasets(ds_type,"
+ + "ds_title,yop,uri,platform,data_type,publisher,publisher_id_type,publisher_id_value,"
+ + "ds_dates_type, ds_dates_value, ds_contributors,reportid) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?) ";
+
+ PreparedStatement pstmtDataset = ConnectDB.DB_HIVE_CONNECTION.prepareStatement(sqlInsertDataset);
+
+ pstmtDataset.setString(1, doi);
+ pstmtDataset.setString(2, dataset_title);
+ pstmtDataset.setString(3, yop);
+ pstmtDataset.setString(4, uri);
+ pstmtDataset.setString(5, platform);
+ pstmtDataset.setString(6, data_type);
+ pstmtDataset.setString(7, publisher);
+ pstmtDataset.setString(8, publisher_id_type);
+ pstmtDataset.setString(9, publisher_id_value);
+ pstmtDataset.setString(10, ds_dates_type);
+ pstmtDataset.setString(11, ds_dates_value);
+ // the INSERT above lists 13 columns, so ds_contributors and reportid are parameters 12 and 13
+ pstmtDataset.setString(12, datasetContributors.getAsString());
+ pstmtDataset.setString(13, reportId);
+
+ pstmtDataset.execute();
+ logger.info("Dataset from compressed report addded " + doi);
+ /*
+ * JsonArray performance = datasetElement.getAsJsonObject().getAsJsonArray("performance"); for
+ * (JsonElement performanceElement : performance) { JsonObject period =
+ * performanceElement.getAsJsonObject().getAsJsonObject("period"); String end_date =
+ * period.getAsJsonObject().get("end-date").getAsString(); String begin_date =
+ * period.getAsJsonObject().get("begin-date").getAsString(); JsonArray instance =
+ * performanceElement.getAsJsonObject().getAsJsonArray("instance"); for (JsonElement instanceElement :
+ * instance) { int count = instanceElement.getAsJsonObject().get("count").getAsInt(); JsonObject
+ * country_counts = instanceElement.getAsJsonObject().getAsJsonObject("country-counts"); Set
+ * keys = country_counts.keySet(); String[] country = new String[country_counts.size()]; String[]
+ * country_counts_val = new String[country_counts.size()]; Iterator it2 = keys.iterator(); int j = 0;
+ * while (it2.hasNext()) { country[j] = it2.next().toString(); country_counts_val[j] =
+ * country_counts.get(country[j]).getAsString(); } Array countryArr = conn.createArrayOf("text",
+ * country); Array countrycountsArr = conn.createArrayOf("text", country_counts_val); String metrictype
+ * = instanceElement.getAsJsonObject().get("metric-type").getAsString(); String accessMethod =
+ * instanceElement.getAsJsonObject().get("access-method").getAsString(); String
+ * sqlInsertDatasetPerformance =
+ * "INSERT INTO datasetperformance(ds_type,period_end,period_from,access_method,metric_type,count,country,country_count, reportid) VALUES(?,?,?,?,?,?,?,?,?)"
+ * ; PreparedStatement pstmtDatasetPerformance = conn.prepareStatement(sqlInsertDatasetPerformance);
+ * //System.out.println(begin_date + " " + end_date + " " + doi + " " + metrictype + " " + count);
+ * pstmtDatasetPerformance.setString(1, doi); pstmtDatasetPerformance.setString(2, end_date);
+ * pstmtDatasetPerformance.setString(3, begin_date); pstmtDatasetPerformance.setString(4, accessMethod);
+ * pstmtDatasetPerformance.setString(5, metrictype); pstmtDatasetPerformance.setInt(6, count);
+ * pstmtDatasetPerformance.setArray(7, countryArr); pstmtDatasetPerformance.setArray(8,
+ * countrycountsArr); pstmtDatasetPerformance.setString(9, reportId); pstmtDatasetPerformance.execute();
+ * } }
+ */
+ }
+ }
+
+ }
+
+ private ArrayList listHdfsDir(String dir) throws Exception {
+
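+ // List the report files stored under the given HDFS directory (non-recursive)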
+ FileSystem hdfs = FileSystem.get(new Configuration());
+ RemoteIterator Files;
+ ArrayList fileNames = new ArrayList<>();
+
+ try {
+ Path exportPath = new Path(hdfs.getUri() + dir);
+ Files = hdfs.listFiles(exportPath, false);
+ while (Files.hasNext()) {
+ String fileName = Files.next().getPath().toString();
+ fileNames.add(fileName);
+ }
+
+ hdfs.close();
+ } catch (Exception e) {
+ logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir));
+ throw new Exception("HDFS file path with exported data does not exist : " + dir, e);
+ }
+
+ return fileNames;
+ }
+
+ private String readHDFSFile(String filename) throws Exception {
+ String result;
+ try {
+
+ FileSystem fs = FileSystem.get(new Configuration());
+ // log.info("reading file : " + filename);
+
+ BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));
+
+ StringBuilder sb = new StringBuilder();
+ String line = br.readLine();
+
+ while (line != null) {
+ sb.append(line);
+ // sb.append(line);
+ line = br.readLine();
+ }
+ // result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
+ result = sb.toString().trim();
+ // fs.close();
+ } catch (Exception e) {
+ throw new Exception(e);
+ }
+
+ return result;
+ }
+
+ public static String uncompressString(String zippedBase64Str)
+ throws IOException {
+ String result = null;
+
+ // In my solr project, I use org.apache.solr.common.util.Base64.
+ // byte[] bytes =
+ // org.apache.solr.common.util.Base64.base64ToByteArray(zippedBase64Str);
+ byte[] bytes = Base64.getDecoder().decode(zippedBase64Str);
+ GZIPInputStream zi = null;
+ try {
+ zi = new GZIPInputStream(new ByteArrayInputStream(bytes));
+ result = IOUtils.toString(zi);
+ } finally {
+ IOUtils.closeQuietly(zi);
+ }
+ return result;
+ }
+
+ private void createTmpReportsTable(String jsonFile) throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ dropTmpReportsTable();
+ String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema()
+ + ".tmpjson (json STRING)";
+ stmt.executeUpdate(createTmpTable);
+ logger.info("Tmp Table Created");
+
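+ // LOAD DATA INPATH moves the report file from HDFS into the temporary tmpjson table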
+ String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE "
+ + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ stmt.execute(insertJsonReport);
+ logger.info("JSON Report File inserted to tmpjson Table");
+ }
+
+ private void dropTmpReportsTable() throws SQLException {
+ logger.info("Dropping tmpjson Table");
+ String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ stmt.executeUpdate(dropTmpTable);
+ logger.info("Dropped tmpjson Table");
+
+ }
+
+}
+
+/*
+ * PreparedStatement prepStatem = conn.
+ * prepareStatement("insert into usageStats (source, entityID,sourceItemType,entityType, counter,action,timestamp_month,referrer) values (?,?,?,?,?,?,?,?)"
+ * );
+ */
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
index 28c4f30a1..7b07fbc25 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import java.io.IOException;
@@ -17,220 +18,94 @@ import org.slf4j.LoggerFactory;
*/
public class UsageStatsExporter {
- private Statement stmt = null;
+ private Statement stmt = null;
- public UsageStatsExporter() {
+ public UsageStatsExporter() {
- }
+ }
- private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
+ private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
- private void reCreateLogDirs() throws IllegalArgumentException, IOException {
- FileSystem dfs = FileSystem.get(new Configuration());
+ private void reCreateLogDirs() throws IllegalArgumentException, IOException {
+ FileSystem dfs = FileSystem.get(new Configuration());
- logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath);
- dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true);
+ logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath);
+ dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true);
- logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath));
+ logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath));
- }
+ }
- public void export() throws Exception {
+ public void export() throws Exception {
- logger.info("Initialising DB properties");
- ConnectDB.init();
- ConnectDB.getHiveConnection();
+ logger.info("Initialising DB properties");
+ ConnectDB.init();
+ ConnectDB.getHiveConnection();
- if (ExecuteWorkflow.recreateDbAndTables) {
- createDatabase();
- createTables();
- reCreateLogDirs();
- }
- logger.info("Initializing the download logs module");
- DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, ExecuteWorkflow.dataciteReportPath);
+ if (ExecuteWorkflow.recreateDbAndTables) {
+ DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", "");
+ datasetsDB.recreateDBAndTables();
+ }
+ logger.info("Initializing the download logs module");
+ DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL,
+ ExecuteWorkflow.dataciteReportPath);
- if (ExecuteWorkflow.datasetsEmptyDirs) {
- logger.info("Downloading Reports List From Datacite");
- drfd.downloadReportsList();
- logger.info("Reports List has been downloaded");
- }
- }
+ if (ExecuteWorkflow.datasetsEmptyDirs) {
+ logger.info("Downloading Reports List From Datacite");
+ drfd.downloadReportsList();
+ logger.info("Reports List has been downloaded");
+ }
- private void createDatabase() throws Exception {
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Dropping datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
- String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
- stmt.executeUpdate(dropDatabase);
- } catch (Exception e) {
- logger.error("Failed to drop database: " + e);
- throw new Exception("Failed to drop database: " + e.toString(), e);
- }
-
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- logger.info("Creating datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
- String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
- stmt.executeUpdate(createDatabase);
-
- } catch (Exception e) {
- logger.error("Failed to create database: " + e);
- throw new Exception("Failed to create database: " + e.toString(), e);
- }
- }
-
- private void createTables() throws Exception {
- try {
- stmt = ConnectDB.getHiveConnection().createStatement();
-
- // Create Reports table - This table should exist
- String sqlCreateTableDataciteeReports = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getDataSetUsageStatsDBSchema()
- + ".datacitereports(reportid STRING, \n"
- + " name STRING, \n"
- + " source STRING,\n"
- + " release STRING,\n"
- + " createdby STRING,\n"
- + " report_end_date STRING,\n"
- + " report_start_date STRING)\n"
- + " CLUSTERED BY (reportid)\n"
- + " into 100 buckets stored as orc tblproperties('transactional'='true')";
-
- stmt.executeUpdate(sqlCreateTableDataciteeReports);
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
-
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
+ ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite(
+ ExecuteWorkflow.dataciteReportPath);
+ logger.info("Store Reports To DB");
+ readReportsListFromDatacite.readReports();
+ logger.info("Reports Stored To DB");
+ }
// runImpalaQuery();
-/*
- PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
-
- logger.info("Re-creating database and tables");
-
- logger.info("Initializing the download logs module");
- PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
-
- if (ExecuteWorkflow.piwikEmptyDirs) {
- logger.info("Recreating Piwik log directories");
- piwikstatsdb.reCreateLogDirs();
- }
-
- // Downloading piwik logs (also managing directory creation)
- if (ExecuteWorkflow.downloadPiwikLogs) {
- logger.info("Downloading piwik logs");
- piwd
- .GetOpenAIRELogs(
- ExecuteWorkflow.repoLogPath,
- ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
- }
- logger.info("Downloaded piwik logs");
-
- // Create DB tables, insert/update statistics
- String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
- piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
-
- if (ExecuteWorkflow.processPiwikLogs) {
- logger.info("Processing logs");
- piwikstatsdb.processLogs();
- }
-
- logger.info("Creating LaReferencia tables");
- LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
- ExecuteWorkflow.lareferenciaAuthToken);
-
- if (ExecuteWorkflow.laReferenciaEmptyDirs) {
- logger.info("Recreating LaReferencia log directories");
- lrf.reCreateLogDirs();
- }
-
- if (ExecuteWorkflow.downloadLaReferenciaLogs) {
- logger.info("Downloading LaReferencia logs");
- lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
- logger.info("Downloaded LaReferencia logs");
- }
- LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
-
- if (ExecuteWorkflow.processLaReferenciaLogs) {
- logger.info("Processing LaReferencia logs");
- lastats.processLogs();
- logger.info("LaReferencia logs done");
- }
-
- IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
- if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
- logger.info("Creating Irus Stats tables");
- irusstats.createTables();
- logger.info("Created Irus Stats tables");
-
- logger.info("Re-create log dirs");
- irusstats.reCreateLogDirs();
- logger.info("Re-created log dirs");
- }
-
- if (ExecuteWorkflow.irusDownloadReports) {
- irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
- }
- if (ExecuteWorkflow.irusProcessStats) {
- irusstats.processIrusStats();
- logger.info("Irus done");
- }
-
- SarcStats sarcStats = new SarcStats();
- if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
- sarcStats.reCreateLogDirs();
- }
- if (ExecuteWorkflow.sarcDownloadReports) {
- sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
- }
- if (ExecuteWorkflow.sarcProcessStats) {
- sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
- sarcStats.finalizeSarcStats();
- }
- logger.info("Sarc done");
-
- // finalize usagestats
- if (ExecuteWorkflow.finalizeStats) {
- piwikstatsdb.finalizeStats();
- logger.info("Finalized stats");
- }
-
- // Make the tables available to Impala
- if (ExecuteWorkflow.finalTablesVisibleToImpala) {
- logger.info("Making tables visible to Impala");
- invalidateMetadata();
- }
-
- logger.info("End");
- */
+ /*
+ * PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
+ * logger.info("Re-creating database and tables"); logger.info("Initializing the download logs module");
+ * PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
+ * if (ExecuteWorkflow.piwikEmptyDirs) { logger.info("Recreating Piwik log directories");
+ * piwikstatsdb.reCreateLogDirs(); } // Downloading piwik logs (also managing directory creation) if
+ * (ExecuteWorkflow.downloadPiwikLogs) { logger.info("Downloading piwik logs"); piwd .GetOpenAIRELogs(
+ * ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); }
+ * logger.info("Downloaded piwik logs"); // Create DB tables, insert/update statistics String cRobotsUrl =
+ * "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
+ * piwikstatsdb.setCounterRobotsURL(cRobotsUrl); if (ExecuteWorkflow.processPiwikLogs) {
+ * logger.info("Processing logs"); piwikstatsdb.processLogs(); } logger.info("Creating LaReferencia tables");
+ * LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
+ * ExecuteWorkflow.lareferenciaAuthToken); if (ExecuteWorkflow.laReferenciaEmptyDirs) {
+ * logger.info("Recreating LaReferencia log directories"); lrf.reCreateLogDirs(); } if
+ * (ExecuteWorkflow.downloadLaReferenciaLogs) { logger.info("Downloading LaReferencia logs");
+ * lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); }
+ * LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if
+ * (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); lastats.processLogs();
+ * logger.info("LaReferencia logs done"); } IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if
+ * (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables");
+ * irusstats.createTables(); logger.info("Created Irus Stats tables"); logger.info("Re-create log dirs");
+ * irusstats.reCreateLogDirs(); logger.info("Re-created log dirs"); } if (ExecuteWorkflow.irusDownloadReports) {
+ * irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } if (ExecuteWorkflow.irusProcessStats) {
+ * irusstats.processIrusStats(); logger.info("Irus done"); } SarcStats sarcStats = new SarcStats(); if
+ * (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); } if
+ * (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray,
+ * ExecuteWorkflow.sarcsReportPathNonArray); } if (ExecuteWorkflow.sarcProcessStats) {
+ * sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
+ * sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); // finalize usagestats if
+ * (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); logger.info("Finalized stats"); } // Make the
+ * tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) {
+ * logger.info("Making tables visible to Impala"); invalidateMetadata(); } logger.info("End");
+ */
}
/*
- private void invalidateMetadata() throws SQLException {
- Statement stmt = null;
-
- stmt = ConnectDB.getImpalaConnection().createStatement();
-
- String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats";
- stmt.executeUpdate(sql);
-
- sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats";
- stmt.executeUpdate(sql);
-
- sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats";
- stmt.executeUpdate(sql);
-
- sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats";
- stmt.executeUpdate(sql);
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
- }
+ * private void invalidateMetadata() throws SQLException { Statement stmt = null; stmt =
+ * ConnectDB.getImpalaConnection().createStatement(); String sql = "INVALIDATE METADATA " +
+ * ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA "
+ * + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " +
+ * ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " +
+ * ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); stmt.close();
+ * ConnectDB.getHiveConnection().close(); }
*/
diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml
index 338b2a2c5..44f28ff56 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml
+++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml
@@ -2,13 +2,13 @@
-
-
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<version>3.6.1</version>
+				<configuration>
+					<source>1.8</source>
+					<target>1.8</target>
+				</configuration>
+			</plugin>
+		</plugins>
+	</build>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
 		<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version>
 		<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version>
-
-
+
+
org.apache.spark
@@ -53,16 +81,16 @@
 			<version>20180130</version>
 			<type>jar</type>
-		<dependency>
-			<groupId>org.apache.hive</groupId>
-			<artifactId>hive-jdbc</artifactId>
-			<version>${cdh.hive.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.apache.hadoop</groupId>
-			<artifactId>hadoop-common</artifactId>
-			<version>${cdh.hadoop.version}</version>
-		</dependency>
+		<dependency>
+			<groupId>org.apache.hive</groupId>
+			<artifactId>hive-jdbc</artifactId>
+			<version>${cdh.hive.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.hadoop</groupId>
+			<artifactId>hadoop-common</artifactId>
+			<version>${cdh.hadoop.version}</version>
+		</dependency>
 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>
 			<artifactId>dhp-common</artifactId>
diff --git a/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh b/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh
new file mode 100755
index 000000000..4465dae21
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh
@@ -0,0 +1 @@
+mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagerawdata
\ No newline at end of file
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java
index f76644c83..5b2e6804b 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java
@@ -122,4 +122,4 @@ public abstract class ConnectDB {
}
-}
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java
index 81e34b3e7..e0e0d3687 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java
@@ -62,7 +62,6 @@ public class ExecuteWorkflow {
static int sarcNumberOfIssnToDownload;
static boolean finalizeStats;
- static boolean finalTablesVisibleToImpala;
static int numberOfDownloadThreads;
@@ -98,98 +97,108 @@ public class ExecuteWorkflow {
usageStatsDBSchema = parser.get("usageStatsDBSchema");
statsDBSchema = parser.get("statsDBSchema");
- if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
+ if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
recreateDbAndTables = true;
- else
+ } else {
recreateDbAndTables = false;
+ }
- if (parser.get("piwikEmptyDirs").toLowerCase().equals("true"))
+ if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) {
piwikEmptyDirs = true;
- else
+ } else {
piwikEmptyDirs = false;
+ }
- if (parser.get("downloadPiwikLogs").toLowerCase().equals("true"))
+ if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) {
downloadPiwikLogs = true;
- else
+ } else {
downloadPiwikLogs = false;
+ }
- if (parser.get("processPiwikLogs").toLowerCase().equals("true"))
+ if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
processPiwikLogs = true;
- else
+ } else {
processPiwikLogs = false;
+ }
- String startingLogPeriodStr = parser.get("startingLogPeriod");
+ String startingLogPeriodStr = parser.get("startingLogPeriod");
Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
- String endingLogPeriodStr = parser.get("endingLogPeriod");
- Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
- endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
+// String endingLogPeriodStr = parser.get("endingLogPeriod");
+// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
+// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
- if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true"))
+ if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) {
laReferenciaEmptyDirs = true;
- else
+ } else {
laReferenciaEmptyDirs = false;
+ }
- if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true"))
+ if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) {
downloadLaReferenciaLogs = true;
- else
+ } else {
downloadLaReferenciaLogs = false;
+ }
- if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true"))
+ if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) {
processLaReferenciaLogs = true;
- else
+ } else {
processLaReferenciaLogs = false;
+ }
- if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true"))
+ if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) {
irusCreateTablesEmptyDirs = true;
- else
+ } else {
irusCreateTablesEmptyDirs = false;
+ }
- if (parser.get("irusDownloadReports").toLowerCase().equals("true"))
+ if (parser.get("irusDownloadReports").toLowerCase().equals("true")) {
irusDownloadReports = true;
- else
+ } else {
irusDownloadReports = false;
+ }
- if (parser.get("irusProcessStats").toLowerCase().equals("true"))
+ if (parser.get("irusProcessStats").toLowerCase().equals("true")) {
irusProcessStats = true;
- else
+ } else {
irusProcessStats = false;
+ }
irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload"));
- if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true"))
+ if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) {
sarcCreateTablesEmptyDirs = true;
- else
+ } else {
sarcCreateTablesEmptyDirs = false;
+ }
- if (parser.get("sarcDownloadReports").toLowerCase().equals("true"))
+ if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) {
sarcDownloadReports = true;
- else
+ } else {
sarcDownloadReports = false;
+ }
- if (parser.get("sarcProcessStats").toLowerCase().equals("true"))
+ if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
sarcProcessStats = true;
- else
+ } else {
sarcProcessStats = false;
+ }
sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload"));
-/*
- if (parser.get("finalizeStats").toLowerCase().equals("true"))
+ if (parser.get("finalizeStats").toLowerCase().equals("true")) {
finalizeStats = true;
- else
+ } else {
finalizeStats = false;
- if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
- finalTablesVisibleToImpala = true;
- else
- finalTablesVisibleToImpala = false;
-*/
+ }
+
numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
usagestatsExport.export();
+ // usagestatsExport.createdDBWithTablesOnly();
}
private static Calendar startingLogPeriodStr(Date date) {
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java
index bb8d8565e..7ec5b0fca 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*;
@@ -27,393 +28,331 @@ import org.slf4j.LoggerFactory;
*/
public class IrusStats {
- private String irusUKURL;
+ private String irusUKURL;
- private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
+ private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
- public IrusStats(String irusUKURL) throws Exception {
- this.irusUKURL = irusUKURL;
- // The following may not be needed - It will be created when JSON tables are created
+ public IrusStats(String irusUKURL) throws Exception {
+ this.irusUKURL = irusUKURL;
+ // The following may not be needed - It will be created when JSON tables are created
// createTmpTables();
- }
+ }
- public void reCreateLogDirs() throws Exception {
- FileSystem dfs = FileSystem.get(new Configuration());
+ public void reCreateLogDirs() throws Exception {
+ FileSystem dfs = FileSystem.get(new Configuration());
- logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
- dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true);
+ logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
+ dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true);
- logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath));
- }
+ logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath));
+ }
- public void createTables() throws Exception {
- try {
- logger.info("Creating sushilog");
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog(source STRING, "
- + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
- + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTableSushiLog);
- logger.info("Created sushilog");
+ public void createTables() throws Exception {
+ try {
+ logger.info("Creating sushilog");
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog(source STRING, "
+ + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
+ + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableSushiLog);
+ logger.info("Created sushilog");
- // To see how to apply to the ignore duplicate rules and indexes
-// stmt.executeUpdate(sqlCreateTableSushiLog);
-// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO sushilog "
-// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
-// + "sushilog.rid, sushilog.date "
-// + "FROM sushilog "
-// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
-// stmt.executeUpdate(sqlcreateRuleSushiLog);
-// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
-// stmt.executeUpdate(createSushiIndex);
- stmt.close();
- ConnectDB.getHiveConnection().close();
- logger.info("Sushi Tables Created");
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Sushi Tables Created");
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
-// // The following may not be needed - It will be created when JSON tables are created
-// private void createTmpTables() throws Exception {
-// try {
-//
-// Statement stmt = ConnectDB.getConnection().createStatement();
-// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
-// stmt.executeUpdate(sqlCreateTableSushiLog);
-//
-// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;");
-// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;";
-// // stmt.executeUpdate(sqlCopyPublicSushiLog);
-// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO sushilogtmp "
-// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository,"
-// + "sushilogtmp.rid, sushilogtmp.date "
-// + "FROM sushilogtmp "
-// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
-// stmt.executeUpdate(sqlcreateRuleSushiLog);
-//
-// stmt.close();
-// ConnectDB.getConnection().close();
-// log.info("Sushi Tmp Tables Created");
-// } catch (Exception e) {
-// log.error("Failed to create tables: " + e);
-// throw new Exception("Failed to create tables: " + e.toString(), e);
-// }
-// }
- public void processIrusStats() throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
+ public void processIrusStats() throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
- logger.info("Adding JSON Serde jar");
- stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
- logger.info("Added JSON Serde jar");
+ logger.info("Adding JSON Serde jar");
+ stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+ logger.info("Added JSON Serde jar");
- logger.info("Dropping sushilogtmp_json table");
- String dropSushilogtmpJson = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".sushilogtmp_json";
- stmt.executeUpdate(dropSushilogtmpJson);
- logger.info("Dropped sushilogtmp_json table");
+ logger.info("Dropping sushilogtmp_json table");
+ String dropSushilogtmpJson = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilogtmp_json";
+ stmt.executeUpdate(dropSushilogtmpJson);
+ logger.info("Dropped sushilogtmp_json table");
- logger.info("Creating irus_sushilogtmp_json table");
- String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n"
- + " `ItemIdentifier` ARRAY<\n"
- + " struct<\n"
- + " Type: STRING,\n"
- + " Value: STRING\n"
- + " >\n"
- + " >,\n"
- + " `ItemPerformance` ARRAY<\n"
- + " struct<\n"
- + " `Period`: struct<\n"
- + " `Begin`: STRING,\n"
- + " `End`: STRING\n"
- + " >,\n"
- + " `Instance`: struct<\n"
- + " `Count`: STRING,\n"
- + " `MetricType`: STRING\n"
- + " >\n"
- + " >\n"
- + " >\n"
- + ")\n"
- + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
- + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n"
- + "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(createSushilogtmpJson);
- logger.info("Created irus_sushilogtmp_json table");
+ logger.info("Creating irus_sushilogtmp_json table");
+ String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n"
+ + " `ItemIdentifier` ARRAY<\n"
+ + " struct<\n"
+ + " Type: STRING,\n"
+ + " Value: STRING\n"
+ + " >\n"
+ + " >,\n"
+ + " `ItemPerformance` ARRAY<\n"
+ + " struct<\n"
+ + " `Period`: struct<\n"
+ + " `Begin`: STRING,\n"
+ + " `End`: STRING\n"
+ + " >,\n"
+ + " `Instance`: struct<\n"
+ + " `Count`: STRING,\n"
+ + " `MetricType`: STRING\n"
+ + " >\n"
+ + " >\n"
+ + " >\n"
+ + ")\n"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(createSushilogtmpJson);
+ logger.info("Created irus_sushilogtmp_json table");
- logger.info("Dropping irus_sushilogtmp table");
- String dropSushilogtmp = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".irus_sushilogtmp";
- stmt.executeUpdate(dropSushilogtmp);
- logger.info("Dropped irus_sushilogtmp table");
+ logger.info("Dropping irus_sushilogtmp table");
+ String dropSushilogtmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".irus_sushilogtmp";
+ stmt.executeUpdate(dropSushilogtmp);
+ logger.info("Dropped irus_sushilogtmp table");
- logger.info("Creating irus_sushilogtmp table");
- String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
- + ".irus_sushilogtmp(source STRING, repository STRING, "
- + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
- + "tblproperties('transactional'='true')";
- stmt.executeUpdate(createSushilogtmp);
- logger.info("Created irus_sushilogtmp table");
+ logger.info("Creating irus_sushilogtmp table");
+ String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
+ + ".irus_sushilogtmp(source STRING, repository STRING, "
+ + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
+ + "tblproperties('transactional'='true')";
+ stmt.executeUpdate(createSushilogtmp);
+ logger.info("Created irus_sushilogtmp table");
- logger.info("Inserting to irus_sushilogtmp table");
- String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp "
- + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), "
- + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, "
- + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json "
- + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
- + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf "
- + "WHERE `ItemIdent`.`Type`= 'OAI'";
- stmt.executeUpdate(insertSushilogtmp);
- logger.info("Inserted to irus_sushilogtmp table");
-/*
- logger.info("Creating downloads_stats table");
- String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`result_id` string, "
- + "`date` string, "
- + "`count` bigint, "
- + "`openaire` bigint)";
- stmt.executeUpdate(createDownloadsStats);
- logger.info("Created downloads_stats table");
+ logger.info("Inserting to irus_sushilogtmp table");
+ String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp "
+ + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), "
+ + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, "
+ + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json "
+ + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
+ + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf "
+ + "WHERE `ItemIdent`.`Type`= 'OAI'";
+ stmt.executeUpdate(insertSushilogtmp);
+ logger.info("Inserted to irus_sushilogtmp table");
- logger.info("Inserting into downloads_stats");
- String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
- + "SELECT s.source, d.id AS repository_id, "
- + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, "
- + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
- + ConnectDB.getStatsDBSchema() + ".result_oids ro "
- + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
- stmt.executeUpdate(insertDStats);
- logger.info("Inserted into downloads_stats");
+ logger.info("Inserting to sushilog table");
+ String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".irus_sushilogtmp";
+ stmt.executeUpdate(insertToShushilog);
+ logger.info("Inserted to sushilog table");
- logger.info("Creating sushilog table");
- String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`rid` string, "
- + "`date` string, "
- + "`metric_type` string, "
- + "`count` int)";
- stmt.executeUpdate(createSushilog);
- logger.info("Created sushilog table");
-*/
- logger.info("Inserting to sushilog table");
- String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM "
- + ConnectDB.getUsageStatsDBSchema()
- + ".irus_sushilogtmp";
- stmt.executeUpdate(insertToShushilog);
- logger.info("Inserted to sushilog table");
+ ConnectDB.getHiveConnection().close();
+ }
- ConnectDB.getHiveConnection().close();
- }
+ public void getIrusRRReport(String irusUKReportPath) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM");
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime()));
- public void getIrusRRReport(String irusUKReportPath) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM");
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime()));
+ // Setting the ending period (last day of the month)
+// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+// end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime()));
+ logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime()));
- String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate="
- + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime())
- + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback=";
+ String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime())
+ + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback=";
- logger.info("(getIrusRRReport) Getting report: " + reportUrl);
+ logger.info("(getIrusRRReport) Getting report: " + reportUrl);
- String text = getJson(reportUrl, "", "");
+ String text = getJson(reportUrl, "", "");
- List opendoarsToVisit = new ArrayList();
- JSONParser parser = new JSONParser();
- JSONObject jsonObject = (JSONObject) parser.parse(text);
- jsonObject = (JSONObject) jsonObject.get("ReportResponse");
- jsonObject = (JSONObject) jsonObject.get("Report");
- jsonObject = (JSONObject) jsonObject.get("Report");
- jsonObject = (JSONObject) jsonObject.get("Customer");
- JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
- int i = 0;
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRow = (JSONObject) aJsonArray;
- JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
- for (Object identifier : itemIdentifier) {
- JSONObject opendoar = (JSONObject) identifier;
- if (opendoar.get("Type").toString().equals("OpenDOAR")) {
- i++;
- opendoarsToVisit.add(opendoar.get("Value").toString());
- break;
- }
- }
- // break;
- }
+ List opendoarsToVisit = new ArrayList();
+ JSONParser parser = new JSONParser();
+ JSONObject jsonObject = (JSONObject) parser.parse(text);
+ jsonObject = (JSONObject) jsonObject.get("ReportResponse");
+ jsonObject = (JSONObject) jsonObject.get("Report");
+ jsonObject = (JSONObject) jsonObject.get("Report");
+ jsonObject = (JSONObject) jsonObject.get("Customer");
+ JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
+ if (jsonArray != null) {
+ int i = 0;
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
+ for (Object identifier : itemIdentifier) {
+ JSONObject opendoar = (JSONObject) identifier;
+ if (opendoar.get("Type").toString().equals("OpenDOAR")) {
+ i++;
+ opendoarsToVisit.add(opendoar.get("Value").toString());
+ break;
+ }
+ }
+ // break;
+ }
- logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit);
+ logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit);
- if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0
- && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) {
- logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
- opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
- }
+ if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0
+ && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) {
+ logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
+ opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
+ }
- logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit);
+ logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit);
- for (String opendoar : opendoarsToVisit) {
- logger.info("Now working on openDoar: " + opendoar);
- this.getIrusIRReport(opendoar, irusUKReportPath);
- }
+ for (String opendoar : opendoarsToVisit) {
+ logger.info("Now working on openDoar: " + opendoar);
+ this.getIrusIRReport(opendoar, irusUKReportPath);
+ }
+ logger.info("(getIrusRRReport) Finished with report: " + reportUrl);
+ } else {
+ logger.info("IRUS Reports not found for day");
+ }
- logger.info("(getIrusRRReport) Finished with report: " + reportUrl);
- }
+ }
- private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception {
+ private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception {
- logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar);
+ logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar);
- ConnectDB.getHiveConnection().setAutoCommit(false);
+ ConnectDB.getHiveConnection().setAutoCommit(false);
- SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
+ SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM");
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
+ // Setting the ending period (last day of the month)
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- PreparedStatement st = ConnectDB
- .getHiveConnection()
- .prepareStatement(
- "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
- st.setString(1, "opendoar____::" + opendoar);
- ResultSet rs_date = st.executeQuery();
- Date dateMax = null;
- while (rs_date.next()) {
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
- int batch_size = 0;
+// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+// end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
- if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) {
- logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar);
- } else {
- while (start.before(end)) {
- logger.info("date: " + simpleDateFormat.format(start.getTime()));
- String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
- + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime())
- + "&RepositoryIdentifier=opendoar%3A" + opendoar
- + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
- start.add(Calendar.MONTH, 1);
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ PreparedStatement st = ConnectDB
+ .getHiveConnection()
+ .prepareStatement(
+ "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
+ st.setString(1, "opendoar____::" + opendoar);
+ ResultSet rs_date = st.executeQuery();
+ Date dateMax = null;
+ while (rs_date.next()) {
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
+ int batch_size = 0;
- logger.info("Downloading file: " + reportUrl);
- String text = getJson(reportUrl, "", "");
- if (text == null) {
- continue;
- }
+ if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar);
+ } else {
+ start.add(Calendar.MONTH, 1);
+ while (start.before(end)) {
+ logger.info("Downloading for date: " + simpleDateFormat.format(start.getTime()));
+ String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime())
+ + "&RepositoryIdentifier=opendoar%3A" + opendoar
+ + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
+ start.add(Calendar.MONTH, 1);
- FileSystem fs = FileSystem.get(new Configuration());
- String filePath = irusUKReportPath + "/" + "IrusIRReport_"
- + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json";
- logger.info("Storing to file: " + filePath);
- FSDataOutputStream fin = fs.create(new Path(filePath), true);
+ logger.info("Downloading file: " + reportUrl);
+ String text = getJson(reportUrl, "", "");
+ if (text == null) {
+ continue;
+ }
- JSONParser parser = new JSONParser();
- JSONObject jsonObject = (JSONObject) parser.parse(text);
- jsonObject = (JSONObject) jsonObject.get("ReportResponse");
- jsonObject = (JSONObject) jsonObject.get("Report");
- jsonObject = (JSONObject) jsonObject.get("Report");
- jsonObject = (JSONObject) jsonObject.get("Customer");
- JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
- if (jsonArray == null) {
- continue;
- }
- String oai = "";
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRow = (JSONObject) aJsonArray;
- fin.write(jsonObjectRow.toJSONString().getBytes());
- fin.writeChar('\n');
- }
+ FileSystem fs = FileSystem.get(new Configuration());
+ String filePath = irusUKReportPath + "/" + "IrusIRReport_"
+ + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json";
+ logger.info("Storing to file: " + filePath);
+ FSDataOutputStream fin = fs.create(new Path(filePath), true);
- fin.close();
- }
+ JSONParser parser = new JSONParser();
+ JSONObject jsonObject = (JSONObject) parser.parse(text);
+ jsonObject = (JSONObject) jsonObject.get("ReportResponse");
+ jsonObject = (JSONObject) jsonObject.get("Report");
+ jsonObject = (JSONObject) jsonObject.get("Report");
+ jsonObject = (JSONObject) jsonObject.get("Customer");
+ JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
+ if (jsonArray == null) {
+ continue;
+ }
+ String oai = "";
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ fin.write(jsonObjectRow.toJSONString().getBytes());
+ fin.writeChar('\n');
+ }
- }
- //ConnectDB.getHiveConnection().close();
+ fin.close();
+ }
- logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar);
- }
+ }
+ // ConnectDB.getHiveConnection().close();
- private String getJson(String url) throws Exception {
- try {
- System.out.println("===> Connecting to: " + url);
- URL website = new URL(url);
- System.out.println("Connection url -----> " + url);
- URLConnection connection = website.openConnection();
+ logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar);
+ }
- // connection.setRequestProperty ("Authorization", "Basic "+encoded);
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
+ private String getJson(String url) throws Exception {
+ try {
+ System.out.println("===> Connecting to: " + url);
+ URL website = new URL(url);
+ System.out.println("Connection url -----> " + url);
+ URLConnection connection = website.openConnection();
+
+ // connection.setRequestProperty ("Authorization", "Basic "+encoded);
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
// response.append("\n");
- }
- }
+ }
+ }
- System.out.println("response ====> " + response.toString());
+ System.out.println("response ====> " + response.toString());
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL: " + e);
- System.out.println("Failed to get URL: " + e);
- throw new Exception("Failed to get URL: " + e.toString(), e);
- }
- }
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL: " + e);
+ System.out.println("Failed to get URL: " + e);
+ throw new Exception("Failed to get URL: " + e.toString(), e);
+ }
+ }
- private String getJson(String url, String username, String password) throws Exception {
- // String cred=username+":"+password;
- // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
- try {
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
- // connection.setRequestProperty ("Authorization", "Basic "+encoded);
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
- response.append("\n");
- }
- }
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL", e);
- return null;
- }
- }
+ private String getJson(String url, String username, String password) throws Exception {
+ // String cred=username+":"+password;
+ // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
+ try {
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
+ // connection.setRequestProperty ("Authorization", "Basic "+encoded);
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+ response.append("\n");
+ }
+ }
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL", e);
+ return null;
+ }
+ }
}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java
index 88550579b..904290af8 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*;
@@ -27,49 +28,49 @@ import org.slf4j.LoggerFactory;
*/
public class LaReferenciaDownloadLogs {
- private final String piwikUrl;
- private Date startDate;
- private final String tokenAuth;
+ private final String piwikUrl;
+ private Date startDate;
+ private final String tokenAuth;
- /*
+ /*
* The Piwik's API method
- */
- private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
- private final String format = "&format=json";
- private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess";
+ */
+ private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+ private final String format = "&format=json";
+ private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess";
- private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class);
+ private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class);
- public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception {
- this.piwikUrl = piwikUrl;
- this.tokenAuth = tokenAuth;
- this.createTables();
+ public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception {
+ this.piwikUrl = piwikUrl;
+ this.tokenAuth = tokenAuth;
+ this.createTables();
// this.createTmpTables();
- }
+ }
- public void reCreateLogDirs() throws IllegalArgumentException, IOException {
- FileSystem dfs = FileSystem.get(new Configuration());
+ public void reCreateLogDirs() throws IllegalArgumentException, IOException {
+ FileSystem dfs = FileSystem.get(new Configuration());
- logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
- dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
+ logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+ dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
- logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
- dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
- }
+ logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
+ }
- private void createTables() throws Exception {
- try {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ private void createTables() throws Exception {
+ try {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
- logger.info("Creating LaReferencia tables");
- String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
- + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
- + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
- + "stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTableLareferenciaLog);
- logger.info("Created LaReferencia tables");
+ logger.info("Creating LaReferencia tables");
+ String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
+ + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ + "stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableLareferenciaLog);
+ logger.info("Created LaReferencia tables");
// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO lareferencialog "
// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
@@ -80,16 +81,16 @@ public class LaReferenciaDownloadLogs {
// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
- stmt.close();
- ConnectDB.getHiveConnection().close();
- logger.info("Lareferencia Tables Created");
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Lareferencia Tables Created");
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- // System.exit(0);
- }
- }
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ // System.exit(0);
+ }
+ }
// private void createTmpTables() throws Exception {
//
@@ -114,152 +115,159 @@ public class LaReferenciaDownloadLogs {
// // System.exit(0);
// }
// }
- private String getPiwikLogUrl() {
- return piwikUrl + "/";
- }
+ private String getPiwikLogUrl() {
+ return piwikUrl + "/";
+ }
- private String getJson(String url) throws Exception {
- try {
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
+ private String getJson(String url) throws Exception {
+ try {
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
// response.append("\n");
- }
- }
+ }
+ }
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL: " + e);
- throw new Exception("Failed to get URL: " + e.toString(), e);
- }
- }
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL: " + e);
+ throw new Exception("Failed to get URL: " + e.toString(), e);
+ }
+ }
- public void GetLaReferenciaRepos(String repoLogsPath) throws Exception {
+ public void GetLaReferenciaRepos(String repoLogsPath) throws Exception {
- String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
- String content = "";
+ String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
+ String content = "";
- List siteIdsToVisit = new ArrayList();
+ List siteIdsToVisit = new ArrayList();
- // Getting all the siteIds in a list for logging reasons & limiting the list
- // to the max number of siteIds
- content = getJson(baseApiUrl);
- JSONParser parser = new JSONParser();
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRow = (JSONObject) aJsonArray;
- siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
- }
- logger.info("Found the following siteIds for download: " + siteIdsToVisit);
+ // Getting all the siteIds in a list for logging reasons & limiting the list
+ // to the max number of siteIds
+ content = getJson(baseApiUrl);
+ JSONParser parser = new JSONParser();
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
+ }
+ logger.info("Found the following siteIds for download: " + siteIdsToVisit);
- if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
- && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) {
- logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
- siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
- }
+ if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+ && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) {
+ logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ }
- logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit);
+ logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit);
- for (int siteId : siteIdsToVisit) {
- logger.info("Now working on LaReferencia MatomoId: " + siteId);
- this.GetLaReFerenciaLogs(repoLogsPath, siteId);
- }
- }
+ for (int siteId : siteIdsToVisit) {
+ logger.info("Now working on LaReferencia MatomoId: " + siteId);
+ this.GetLaReFerenciaLogs(repoLogsPath, siteId);
+ }
+ }
- public void GetLaReFerenciaLogs(String repoLogsPath,
- int laReferencialMatomoID) throws Exception {
+ public void GetLaReFerenciaLogs(String repoLogsPath,
+ int laReferencialMatomoID) throws Exception {
- logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);
+ logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("Starting period for log download: " + sdf.format(start.getTime()));
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+ // Setting the ending period (last day of the month)
+// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+// end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
- PreparedStatement st = ConnectDB
- .getHiveConnection()
- .prepareStatement(
- "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
- + ".lareferencialog WHERE matomoid=?");
- st.setInt(1, laReferencialMatomoID);
- Date dateMax = null;
+ logger.info("Ending period for log download: " + sdf.format(end.getTime()));
- ResultSet rs_date = st.executeQuery();
- while (rs_date.next()) {
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
+ PreparedStatement st = ConnectDB
+ .getHiveConnection()
+ .prepareStatement(
+ "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ + ".lareferencialog WHERE matomoid=?");
+ st.setInt(1, laReferencialMatomoID);
+ Date dateMax = null;
- for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
- Date date = currDay.getTime();
- if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
- logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID);
- } else {
- logger
- .info(
- "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "
- + sdf.format(date));
+ ResultSet rs_date = st.executeQuery();
+ while (rs_date.next()) {
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
- String period = "&period=day&date=" + sdf.format(date);
- String outFolder = "";
- outFolder = repoLogsPath;
+ for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+ Date date = currDay.getTime();
+ if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+ logger
+ .info(
+ "Date found in logs " + dateMax + " and not downloanding Matomo logs for "
+ + laReferencialMatomoID);
+ } else {
+ logger
+ .info(
+ "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "
+ + sdf.format(date));
- FileSystem fs = FileSystem.get(new Configuration());
- FSDataOutputStream fin = fs
- .create(
- new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"),
- true);
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = "";
+ outFolder = repoLogsPath;
- String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format
- + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
- String content = "";
- int i = 0;
+ FileSystem fs = FileSystem.get(new Configuration());
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(
+ outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"),
+ true);
- JSONParser parser = new JSONParser();
- do {
- String apiUrl = baseApiUrl;
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
+ int i = 0;
- if (i > 0) {
- apiUrl += "&filter_offset=" + (i * 1000);
- }
+ JSONParser parser = new JSONParser();
+ do {
+ String apiUrl = baseApiUrl;
- content = getJson(apiUrl);
- if (content.length() == 0 || content.equals("[]")) {
- break;
- }
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
- fin.write(jsonObjectRaw.toJSONString().getBytes());
- fin.writeChar('\n');
- }
+ content = getJson(apiUrl);
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
- logger
- .info(
- "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID
- + " and for "
- + sdf.format(date));
- i++;
- } while (true);
- fin.close();
- }
- }
- }
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ fin.write(jsonObjectRaw.toJSONString().getBytes());
+ fin.writeChar('\n');
+ }
+
+ logger
+ .info(
+ "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID
+ + " and for "
+ + sdf.format(date));
+ i++;
+ } while (true);
+ fin.close();
+ }
+ }
+ }
}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java
index c20781767..bcf1711cb 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java
@@ -61,15 +61,6 @@ public class LaReferenciaStats {
"stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableLareferenciaLog);
logger.info("Created LaReferencia tables");
-// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO lareferencialog "
-// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
-// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id "
-// + "FROM lareferencialog "
-// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
-// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");";
-// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
-// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
stmt.close();
ConnectDB.getHiveConnection().close();
@@ -82,30 +73,6 @@ public class LaReferenciaStats {
}
}
-// private void createTmpTables() throws Exception {
-//
-// try {
-// Statement stmt = ConnectDB.getConnection().createStatement();
-// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
-// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
-// + " ON INSERT TO lareferencialogtmp "
-// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit,"
-// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id "
-// + "FROM lareferencialogtmp "
-// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
-// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog);
-// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog);
-//
-// stmt.close();
-// log.info("Lareferencia Tmp Tables Created");
-//
-// } catch (Exception e) {
-// log.error("Failed to create tmptables: " + e);
-// throw new Exception("Failed to create tmp tables: " + e.toString(), e);
-// // System.exit(0);
-// }
-// }
-
public void processLogs() throws Exception {
try {
logger.info("Processing LaReferencia repository logs");
@@ -116,16 +83,7 @@ public class LaReferenciaStats {
removeDoubleClicks();
logger.info("LaReferencia removed double clicks");
-/********
- logger.info("LaReferencia creating viewsStats");
- viewsStats();
- logger.info("LaReferencia created viewsStats");
- logger.info("LaReferencia creating downloadsStats");
- downloadsStats();
- logger.info("LaReferencia created downloadsStats");
-
-************/
- logger.info("LaReferencia updating Production Tables");
+ logger.info("LaReferencia updating Production Tables");
updateProdTables();
logger.info("LaReferencia updated Production Tables");
@@ -255,88 +213,6 @@ public class LaReferenciaStats {
// conn.close();
}
- public void viewsStats() throws Exception {
-
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Creating la_result_views_monthly_tmp view");
- String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp AS "
- +
- "SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " +
- "THEN 1 ELSE 0 END) AS openaire_referrer, " +
- "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp where action='action' and " +
- "(source_item_type='oaItem' or source_item_type='repItem') " +
- "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
- "source ORDER BY source, entity_id";
- stmt.executeUpdate(sql);
- logger.info("Created la_result_views_monthly_tmp view");
-
- logger.info("Dropping la_views_stats_tmp table");
- sql = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".la_views_stats_tmp";
- stmt.executeUpdate(sql);
- logger.info("Dropped la_views_stats_tmp table");
-
- logger.info("Creating la_views_stats_tmp table");
- sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp " +
- "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " +
- "max(views) AS count, max(openaire_referrer) AS openaire " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp p, " +
- ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
- "WHERE p.source=d.oid AND p.id=ro.oid " +
- "GROUP BY d.id, ro.id, month " +
- "ORDER BY d.id, ro.id, month";
- stmt.executeUpdate(sql);
- logger.info("Created la_views_stats_tmp table");
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
- }
-
- private void downloadsStats() throws Exception {
-
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
-
- logger.info("Creating la_result_downloads_monthly_tmp view");
- String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
- + ".la_result_downloads_monthly_tmp AS " +
- "SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " +
- "THEN 1 ELSE 0 END) AS openaire_referrer, " +
- "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp where action='download' and " +
- "(source_item_type='oaItem' or source_item_type='repItem') " +
- "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
- "source ORDER BY source, entity_id";
- stmt.executeUpdate(sql);
- logger.info("Created la_result_downloads_monthly_tmp view");
-
- logger.info("Dropping la_downloads_stats_tmp table");
- sql = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".la_downloads_stats_tmp";
- stmt.executeUpdate(sql);
- logger.info("Dropped la_downloads_stats_tmp table");
-
- logger.info("Creating la_downloads_stats_tmp table");
- sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp " +
- "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " +
- "max(downloads) AS count, max(openaire_referrer) AS openaire " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_downloads_monthly_tmp p, " +
- ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
- "WHERE p.source=d.oid AND p.id=ro.oid " +
- "GROUP BY d.id, ro.id, month " +
- "ORDER BY d.id, ro.id, month";
- stmt.executeUpdate(sql);
- logger.info("Created la_downloads_stats_tmp table");
-
- stmt.close();
- ConnectDB.getHiveConnection().close();
- }
-
private void updateProdTables() throws SQLException, Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
@@ -346,40 +222,11 @@ public class LaReferenciaStats {
String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog " +
"select * from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp";
stmt.executeUpdate(sql);
-/*****
- logger.info("Updating views_stats");
- sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
- "select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp";
- stmt.executeUpdate(sql);
-// sql = "insert into public.views_stats select * from la_views_stats_tmp;";
-// stmt.executeUpdate(sql);
-
- logger.info("Updating downloads_stats");
- sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
- "select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
- stmt.executeUpdate(sql);
-
- logger.info("Inserting data to usage_stats from lareferencia");
- sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
- "SELECT coalesce(ds.source, vs.source) as source, " +
- "coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
- "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
- "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
- "coalesce(ds.openaire, 0) as openaire_downloads, " +
- "coalesce(vs.openaire, 0) as openaire_views " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp AS ds FULL OUTER JOIN " +
- ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp AS vs ON ds.source=vs.source " +
- "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
- stmt.executeUpdate(sql);
- logger.info("Inserted data to usage_stats from lareferencia");
-// sql = "insert into public.downloads_stats select * from la_downloads_stats_tmp;";
-// stmt.executeUpdate(sql);
-****/
logger.info("Dropping lareferencialogtmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp";
- logger.info("Dropped lareferencialogtmp");
- stmt.executeUpdate(sql);
+ stmt.executeUpdate(sql);
+ logger.info("Dropped lareferencialogtmp");
stmt.close();
ConnectDB.getHiveConnection().close();
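For reference, the block above appends the freshly built lareferencialogtmp table into the production lareferencialog table and then drops the tmp table. A minimal sketch of that tmp-to-production pattern follows; the class name PromoteTmpTableSketch and its promote() helper are illustrative only, and the schema string stands in for ConnectDB.getUsageStatsDBSchema() as used in the patch.

// Illustrative sketch only: promotes a tmp Hive table into its production
// counterpart and then drops the tmp table, mirroring updateProdTables() above.
import java.sql.Connection;
import java.sql.Statement;

public class PromoteTmpTableSketch {

	public static void promote(Connection hiveConnection, String usageStatsSchema) throws Exception {
		try (Statement stmt = hiveConnection.createStatement()) {
			// Append the tmp table contents into the production table.
			stmt
				.executeUpdate(
					"INSERT INTO " + usageStatsSchema + ".lareferencialog "
						+ "SELECT * FROM " + usageStatsSchema + ".lareferencialogtmp");
			// Drop the tmp table so the next run starts from a clean slate.
			stmt.executeUpdate("DROP TABLE " + usageStatsSchema + ".lareferencialogtmp");
		}
	}
}

Running the INSERT before the DROP keeps the production table append-only and lets the next workflow run recreate the tmp table from scratch.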
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java
index 5cc9ec563..a84d6743f 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java
@@ -1,9 +1,12 @@
+
package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*;
import java.net.Authenticator;
import java.net.URL;
import java.net.URLConnection;
+import java.nio.file.Files;
+import java.nio.file.Paths;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
@@ -30,299 +33,299 @@ import org.slf4j.LoggerFactory;
*/
public class PiwikDownloadLogs {
- private final String piwikUrl;
- private Date startDate;
- private final String tokenAuth;
+ private final String piwikUrl;
+ private Date startDate;
+ private final String tokenAuth;
- /*
+ /*
* The Piwik's API method
- */
- private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
- private final String format = "&format=json";
+ */
+ private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+ private final String format = "&format=json";
- private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
+ private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
- public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
- this.piwikUrl = piwikUrl;
- this.tokenAuth = tokenAuth;
+ public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
+ this.piwikUrl = piwikUrl;
+ this.tokenAuth = tokenAuth;
- }
+ }
- private String getPiwikLogUrl() {
- return "https://" + piwikUrl + "/";
- }
+ private String getPiwikLogUrl() {
+ return "https://" + piwikUrl + "/";
+ }
- private String getJson(String url) throws Exception {
- try {
- logger.debug("Connecting to download the JSON: " + url);
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
+ private String getJson(String url) throws Exception {
+ try {
+ logger.debug("Connecting to download the JSON: " + url);
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
- }
- }
- return response.toString();
- } catch (Exception e) {
- logger.error("Failed to get URL: " + url + " Exception: " + e);
- throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
- }
- }
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+ }
+ }
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL: " + url + " Exception: " + e);
+ throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
+ }
+ }
- class WorkerThread implements Runnable {
+ class WorkerThread implements Runnable {
- private Calendar currDay;
- private int siteId;
- private String repoLogsPath;
- private String portalLogPath;
- private String portalMatomoID;
+ private Calendar currDay;
+ private int siteId;
+ private String repoLogsPath;
+ private String portalLogPath;
+ private String portalMatomoID;
- public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws IOException {
- this.currDay = (Calendar) currDay.clone();
- this.siteId = new Integer(siteId);
- this.repoLogsPath = new String(repoLogsPath);
- this.portalLogPath = new String(portalLogPath);
- this.portalMatomoID = new String(portalMatomoID);
- }
+ public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws IOException {
+ this.currDay = (Calendar) currDay.clone();
+ this.siteId = new Integer(siteId);
+ this.repoLogsPath = new String(repoLogsPath);
+ this.portalLogPath = new String(portalLogPath);
+ this.portalMatomoID = new String(portalMatomoID);
+ }
- public void run() {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- System.out
- .println(
- Thread.currentThread().getName() + " (Start) Thread for "
- + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
- + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
- + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
- try {
- GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ public void run() {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Start) Thread for "
+ + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+ + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+ + ", portalMatomoID=" + portalMatomoID);
+ try {
+ GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- System.out
- .println(
- Thread.currentThread().getName() + " (End) Thread for "
- + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
- + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
- + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
- }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (End) Thread for "
+ + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+ + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+ + ", portalMatomoID=" + portalMatomoID);
+ }
- public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- Date date = currDay.getTime();
- logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
+ Date date = currDay.getTime();
+ logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
- String period = "&period=day&date=" + sdf.format(date);
- String outFolder = "";
- if (siteId == Integer.parseInt(portalMatomoID)) {
- outFolder = portalLogPath;
- } else {
- outFolder = repoLogsPath;
- }
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = "";
+ if (siteId == Integer.parseInt(portalMatomoID)) {
+ outFolder = portalLogPath;
+ } else {
+ outFolder = repoLogsPath;
+ }
- String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
- + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
- String content = "";
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
- int i = 0;
+ int i = 0;
- JSONParser parser = new JSONParser();
- StringBuffer totalContent = new StringBuffer();
- FileSystem fs = FileSystem.get(new Configuration());
+ JSONParser parser = new JSONParser();
+ StringBuffer totalContent = new StringBuffer();
+ FileSystem fs = FileSystem.get(new Configuration());
- do {
- int writtenBytes = 0;
- String apiUrl = baseApiUrl;
+ do {
+ int writtenBytes = 0;
+ String apiUrl = baseApiUrl;
- if (i > 0) {
- apiUrl += "&filter_offset=" + (i * 1000);
- }
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
- content = getJson(apiUrl);
- if (content.length() == 0 || content.equals("[]")) {
- break;
- }
+ content = getJson(apiUrl);
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
- FSDataOutputStream fin = fs
- .create(
- new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json"),
- true);
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
- byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
- fin.write(jsonObjectRawBytes);
- fin.writeChar('\n');
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json"),
+ true);
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
- writtenBytes += jsonObjectRawBytes.length + 1;
- }
+ writtenBytes += jsonObjectRawBytes.length + 1;
+ }
- fin.close();
- System.out
- .println(
- Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
- + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json");
+ fin.close();
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json");
- i++;
- } while (true);
+ i++;
+ } while (true);
- fs.close();
- }
- }
+ fs.close();
+ }
+ }
- public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
+ public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
- Statement statement = ConnectDB.getHiveConnection().createStatement();
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ Statement statement = ConnectDB.getHiveConnection().createStatement();
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- ResultSet rs = statement
- .executeQuery(
- "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
- + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
+ ResultSet rs = statement
+ .executeQuery(
+ "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
+ + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
- // Getting all the piwikids in a list for logging reasons & limitting the list
- // to the max number of piwikids
- List<Integer> piwikIdToVisit = new ArrayList<Integer>();
- //while (rs.next())
- //piwikIdToVisit.add(rs.getInt(1));
- piwikIdToVisit.add(13);
- piwikIdToVisit.add(109);
-
- logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
+ // Getting all the piwikids in a list for logging reasons & limiting the list
+ // to the max number of piwikids
+ List<Integer> piwikIdToVisit = new ArrayList<Integer>();
+ while (rs.next()) {
+ piwikIdToVisit.add(rs.getInt(1));
+ }
+ logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
- if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
- && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
- logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
- piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
- }
+ if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+ && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
+ logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ }
- logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
+ logger.info("Downloading from repos with the following piwikIds: " + piwikIdToVisit);
- // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
- for (int siteId : piwikIdToVisit) {
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+ // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
+ for (int siteId : piwikIdToVisit) {
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("Starting period for log download: " + sdf.format(start.getTime()));
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+ // Setting the ending period (the day before the current date)
+ // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
+ // end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ logger.info("Ending period for log download: " + sdf.format(end.getTime()));
- logger.info("Now working on piwikId: " + siteId);
+ logger.info("Now working on piwikId: " + siteId);
- PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
- .prepareStatement(
- "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
- + ".piwiklog WHERE source=?");
- st.setInt(1, siteId);
- Date dateMax = null;
- ResultSet rs_date = st.executeQuery();
- while (rs_date.next()) {
- logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
+ PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
+ .prepareStatement(
+ "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklog WHERE source=?");
+ st.setInt(1, siteId);
+ Date dateMax = null;
+ ResultSet rs_date = st.executeQuery();
+ while (rs_date.next()) {
+ logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
- for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
- // logger.info("Date used " + currDay.toString());
- // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
- // executor.execute(worker);// calling execute method of ExecutorService
- logger.info("Date used " + currDay.getTime().toString());
+ for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+ // logger.info("Date used " + currDay.toString());
+ // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ // executor.execute(worker);// calling execute method of ExecutorService
+ logger.info("Date used " + currDay.getTime().toString());
- if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
- logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId);
- } else {
- GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
- }
+ if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloading Matomo logs for " + siteId);
+ } else {
+ GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ }
- }
- }
- // executor.shutdown();
- // while (!executor.isTerminated()) {
- // }
- // System.out.println("Finished all threads");
- }
+ }
+ }
+ // executor.shutdown();
+ // while (!executor.isTerminated()) {
+ // }
+ // System.out.println("Finished all threads");
+ }
- public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
- String portalMatomoID) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- Date date = currDay.getTime();
- logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
+ Date date = currDay.getTime();
+ logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
- String period = "&period=day&date=" + sdf.format(date);
- String outFolder = "";
- if (siteId == Integer.parseInt(portalMatomoID)) {
- outFolder = portalLogPath;
- } else {
- outFolder = repoLogsPath;
- }
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = "";
+ if (siteId == Integer.parseInt(portalMatomoID)) {
+ outFolder = portalLogPath;
+ } else {
+ outFolder = repoLogsPath;
+ }
- String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
- + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
- String content = "";
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
- int i = 0;
+ int i = 0;
- JSONParser parser = new JSONParser();
- StringBuffer totalContent = new StringBuffer();
- FileSystem fs = FileSystem.get(new Configuration());
+ JSONParser parser = new JSONParser();
+ StringBuffer totalContent = new StringBuffer();
+ FileSystem fs = FileSystem.get(new Configuration());
- do {
- int writtenBytes = 0;
- String apiUrl = baseApiUrl;
+ do {
+ int writtenBytes = 0;
+ String apiUrl = baseApiUrl;
- if (i > 0) {
- apiUrl += "&filter_offset=" + (i * 1000);
- }
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
- content = getJson(apiUrl);
- if (content.length() == 0 || content.equals("[]")) {
- break;
- }
+ content = getJson(apiUrl);
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
- FSDataOutputStream fin = fs
- .create(
- new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json"),
- true);
- JSONArray jsonArray = (JSONArray) parser.parse(content);
- for (Object aJsonArray : jsonArray) {
- JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
- byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
- fin.write(jsonObjectRawBytes);
- fin.writeChar('\n');
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json"),
+ true);
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
- writtenBytes += jsonObjectRawBytes.length + 1;
- }
+ writtenBytes += jsonObjectRawBytes.length + 1;
+ }
- fin.close();
- System.out
- .println(
- Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
- + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
- + ".json");
+ fin.close();
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json");
- i++;
- } while (true);
+ i++;
+ } while (true);
- fs.close();
- }
+ fs.close();
+ }
}
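The download loop above pages through Matomo's Live.getLastVisitsDetails API in chunks of 1000 visits, bumping filter_offset until an empty JSON array comes back. A minimal sketch of that paging logic, factored out of the surrounding HDFS-writing code, follows; MatomoPagingSketch and the PageFetcher interface are illustrative stand-ins for the class's getJson() helper.

// Illustrative sketch only: collects every page returned by a Matomo API call
// by increasing filter_offset in steps of 1000, as GetOpenAIRELogsForDate does.
import java.util.ArrayList;
import java.util.List;

public class MatomoPagingSketch {

	public interface PageFetcher {
		String get(String url) throws Exception;
	}

	public static List<String> fetchAllPages(String baseApiUrl, PageFetcher fetchJson) throws Exception {
		List<String> pages = new ArrayList<>();
		int i = 0;
		while (true) {
			String apiUrl = baseApiUrl;
			if (i > 0) {
				// Each call returns at most filter_limit=1000 visits, so later
				// calls page through the remainder with filter_offset.
				apiUrl += "&filter_offset=" + (i * 1000);
			}
			String content = fetchJson.get(apiUrl);
			if (content.isEmpty() || content.equals("[]")) {
				// An empty JSON array marks the last page.
				break;
			}
			pages.add(content);
			i++;
		}
		return pages;
	}
}

The loop is bounded only by the empty-array response, matching the do/while structure in GetOpenAIRELogsForDate.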
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java
index 4903d9599..9144620b7 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java
@@ -60,7 +60,7 @@ public class PiwikStatsDB {
this.createTables();
// The piwiklog table is not needed since it is built
// on top of JSON files
- ////////////this.createTmpTables();
+ //////////// this.createTmpTables();
}
public ArrayList getRobotsList() {
@@ -86,6 +86,7 @@ public class PiwikStatsDB {
logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
stmt.executeUpdate(dropDatabase);
+
} catch (Exception e) {
logger.error("Failed to drop database: " + e);
throw new Exception("Failed to drop database: " + e.toString(), e);
@@ -117,10 +118,15 @@ public class PiwikStatsDB {
+ "into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTablePiwikLog);
+// String dropT = "TRUNCATE TABLE "
+// + ConnectDB.getUsageStatsDBSchema()
+// + ".piwiklog ";
+// stmt.executeUpdate(dropT);
+// logger.info("truncated piwiklog");
+
/////////////////////////////////////////
// Rule for duplicate inserts @ piwiklog
/////////////////////////////////////////
-
String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
@@ -131,7 +137,6 @@ public class PiwikStatsDB {
//////////////////////////////////////////////////
// Rule for duplicate inserts @ process_portal_log
//////////////////////////////////////////////////
-
stmt.close();
ConnectDB.getHiveConnection().close();
@@ -141,47 +146,6 @@ public class PiwikStatsDB {
}
}
-/***** public void createTmpTables() throws Exception {
- try {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
- + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
- + "stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTmpTablePiwikLog);
-
- //////////////////////////////////////////////////
- // Rule for duplicate inserts @ piwiklogtmp
- //////////////////////////////////////////////////
-
- //////////////////////////////////////////////////
- // Copy from public.piwiklog to piwiklog
- //////////////////////////////////////////////////
- // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;";
- // stmt.executeUpdate(sqlCopyPublicPiwiklog);
-
- String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
- + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
- + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
- stmt.executeUpdate(sqlCreateTmpTablePortalLog);
-
- //////////////////////////////////////////////////
- // Rule for duplicate inserts @ process_portal_log_tmp
- //////////////////////////////////////////////////
-
- stmt.close();
-
- } catch (Exception e) {
- logger.error("Failed to create tmptables: " + e);
- throw new Exception("Failed to create tmp tables: " + e.toString(), e);
- // System.exit(0);
- }
- }
-******/
public void processLogs() throws Exception {
try {
ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL());
@@ -203,23 +167,17 @@ public class PiwikStatsDB {
processPortalLog();
logger.info("Portal logs process done");
- logger.info("Processing portal usagestats");
- portalStats();
+ logger.info("Processing portal usagestats");
+ portalLogs();
logger.info("Portal usagestats process done");
-/*****
- logger.info("ViewsStats processing starts");
- viewsStats();
- logger.info("ViewsStats processing ends");
-
- logger.info("DownloadsStats processing starts");
- downloadsStats();
- logger.info("DownloadsStats processing starts");
-*****/
logger.info("Updating Production Tables");
updateProdTables();
logger.info("Updated Production Tables");
+ logger.info("Create Pedocs Tables");
+ createPedocsOldUsageData();
+ logger.info("Pedocs Tables Created");
} catch (Exception e) {
logger.error("Failed to process logs: " + e);
@@ -237,65 +195,65 @@ public class PiwikStatsDB {
logger.info("Added JSON Serde jar");
logger.info("Dropping piwiklogtmp_json table");
- String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".piwiklogtmp_json";
+ String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklogtmp_json";
stmt.executeUpdate(drop_piwiklogtmp_json);
logger.info("Dropped piwiklogtmp_json table");
logger.info("Creating piwiklogtmp_json");
- String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".piwiklogtmp_json(\n" +
- " `idSite` STRING,\n" +
- " `idVisit` STRING,\n" +
- " `country` STRING,\n" +
- " `referrerName` STRING,\n" +
- " `browser` STRING,\n" +
- " `actionDetails` ARRAY<\n" +
- " struct<\n" +
- " type: STRING,\n" +
- " url: STRING,\n" +
- " `customVariables`: struct<\n" +
- " `1`: struct<\n" +
- " `customVariablePageValue1`: STRING\n" +
- " >\n" +
- " >,\n" +
- " timestamp: String\n" +
- " >\n" +
- " >\n" +
- ")\n" +
- "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
- "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" +
- "TBLPROPERTIES (\"transactional\"=\"false\")";
+ String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklogtmp_json(\n"
+ + " `idSite` STRING,\n"
+ + " `idVisit` STRING,\n"
+ + " `country` STRING,\n"
+ + " `referrerName` STRING,\n"
+ + " `browser` STRING,\n"
+ + " `actionDetails` ARRAY<\n"
+ + " struct<\n"
+ + " type: STRING,\n"
+ + " url: STRING,\n"
+ + " `customVariables`: struct<\n"
+ + " `1`: struct<\n"
+ + " `customVariablePageValue1`: STRING\n"
+ + " >\n"
+ + " >,\n"
+ + " timestamp: String\n"
+ + " >\n"
+ + " >\n"
+ + ")\n"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(create_piwiklogtmp_json);
logger.info("Created piwiklogtmp_json");
logger.info("Dropping piwiklogtmp table");
- String drop_piwiklogtmp = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".piwiklogtmp";
+ String drop_piwiklogtmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklogtmp";
stmt.executeUpdate(drop_piwiklogtmp);
logger.info("Dropped piwiklogtmp");
logger.info("Creating piwiklogtmp");
- String create_piwiklogtmp = "CREATE TABLE " +
- ConnectDB.getUsageStatsDBSchema() +
- ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " +
- "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
- "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ String create_piwiklogtmp = "CREATE TABLE "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, "
+ + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(create_piwiklogtmp);
logger.info("Created piwiklogtmp");
logger.info("Inserting into piwiklogtmp");
- String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " +
- "actiondetail.type as action, actiondetail.url as url, " +
- "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " +
- "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " +
- "referrerName as referrer_name, browser as agent\n" +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" +
- "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+ String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, "
+ + "actiondetail.type as action, actiondetail.url as url, "
+ + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, "
+ + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, "
+ + "referrerName as referrer_name, browser as agent\n"
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n"
+ + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
stmt.executeUpdate(insert_piwiklogtmp);
logger.info("Inserted into piwiklogtmp");
@@ -308,33 +266,31 @@ public class PiwikStatsDB {
logger.info("Cleaning download double clicks");
// clean download double clicks
- String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "WHERE EXISTS (\n" +
- "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " +
- ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" +
- "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
- +
- "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" +
- "AND p1.timestamp\n" +
- " >\n" +
- ")\n" +
- "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
- "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" +
- "TBLPROPERTIES (\"transactional\"=\"false\")";
+ String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json("
+ + " `idSite` STRING,\n"
+ + " `idVisit` STRING,\n"
+ + " `country` STRING,\n"
+ + " `referrerName` STRING,\n"
+ + " `browser` STRING,\n"
+ + " `actionDetails` ARRAY<\n"
+ + " struct<\n"
+ + " type: STRING,\n"
+ + " url: STRING,\n"
+ + " timestamp: String\n"
+ + " >\n"
+ + " >\n"
+ + ")\n"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(create_process_portal_log_tmp_json);
logger.info("Created process_portal_log_tmp_json");
logger.info("Dropping process_portal_log_tmp table");
- String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".process_portal_log_tmp";
+ String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".process_portal_log_tmp";
stmt.executeUpdate(drop_process_portal_log_tmp);
logger.info("Dropped process_portal_log_tmp");
logger.info("Creating process_portal_log_tmp");
- String create_process_portal_log_tmp = "CREATE TABLE " +
- ConnectDB.getUsageStatsDBSchema() +
- ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " +
- "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
- "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ String create_process_portal_log_tmp = "CREATE TABLE "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, "
+ + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(create_process_portal_log_tmp);
logger.info("Created process_portal_log_tmp");
logger.info("Inserting into process_portal_log_tmp");
String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
- + ".process_portal_log_tmp " +
- "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
- +
- "actiondetail.url as url, " +
- "CASE\n" +
- " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " +
- " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " +
- " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
- +
- " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " +
- " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " +
- " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " +
- " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " +
- " ELSE '' " +
- "END AS entity_id, " +
- "CASE " +
- " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " +
- " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " +
- " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " +
- " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " +
- " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " +
- " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " +
- " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " +
- " ELSE '' " +
- "END AS source_item_type, " +
- "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " +
- "browser as agent " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " +
- "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+ + ".process_portal_log_tmp "
+ + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
+ + "actiondetail.url as url, "
+ + "CASE\n"
+ + " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] "
+ + " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] "
+ + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
+ + " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] "
+ + " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] "
+ + " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] "
+ + " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] "
+ + " ELSE '' "
+ + "END AS entity_id, "
+ + "CASE "
+ + " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' "
+ + " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' "
+ + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' "
+ + " WHEN (actiondetail.url like '%articleId=%') THEN 'result' "
+ + " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' "
+ + " WHEN (actiondetail.url like '%projectId=%') THEN 'project' "
+ + " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' "
+ + " ELSE '' "
+ + "END AS source_item_type, "
+ + "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, "
+ + "browser as agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json "
+ + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
stmt.executeUpdate(insert_process_portal_log_tmp);
logger.info("Inserted into process_portal_log_tmp");
stmt.close();
}
- public void portalStats() throws SQLException {
+ public void portalLogs() throws SQLException {
Connection con = ConnectDB.getHiveConnection();
Statement stmt = con.createStatement();
con.setAutoCommit(false);
-// Original queries where of the style
-//
-// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent
-// FROM usagestats_20200907.process_portal_log_tmp2,
-// openaire_prod_stats_20200821.result_oids roid
-// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null
-//
-// The following query is an example of how queries should be
-//
-//
-// INSERT INTO usagestats_20200907.piwiklogtmp
-// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
-// FROM usagestats_20200907.process_portal_log_tmp
-// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id
-// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL);
-//
-// We should consider if we would like the queries to be as the following
-//
-// INSERT INTO usagestats_20200907.piwiklogtmp
-// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
-// FROM usagestats_20200907.process_portal_log_tmp
-// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id
-// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND
-// roid.oid != '');
-
logger.info("PortalStats - Step 1");
- String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent "
- +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
- "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
- "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".result_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql);
stmt.close();
logger.info("PortalStats - Step 2");
stmt = con.createStatement();
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent "
- +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
- "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
- "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".datasource_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql);
stmt.close();
@@ -494,12 +421,11 @@ public class PiwikStatsDB {
*/
logger.info("PortalStats - Step 3");
stmt = con.createStatement();
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent "
- +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
- "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
- "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".project_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql);
stmt.close();
@@ -512,233 +438,233 @@ public class PiwikStatsDB {
logger.info("Cleaning oai - Step 1");
stmt = ConnectDB.getHiveConnection().createStatement();
- String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," +
- "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
+ String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/',"
+ + "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 2");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," +
- "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/',"
+ + "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 3");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," +
- "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/',"
+ + "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 4");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," +
- "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/',"
+ + "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 5");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," +
- "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/',"
+ + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 6");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," +
- "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/',"
+ + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 7");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," +
- "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/',"
+ + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 8");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," +
- "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/',"
+ + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 9");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," +
- "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/',"
+ + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 10");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," +
- "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/',"
+ + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 11");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," +
- "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/',"
+ + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 12");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," +
- "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/',"
+ + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 13");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," +
- "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/',"
+ + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 14");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," +
- "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/',"
+ + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 15");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," +
- "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/',"
+ + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 16");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," +
- "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/',"
+ + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 17");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," +
- "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/',"
+ + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 18");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," +
- "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/',"
+ + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 19");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," +
- "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/',"
+ + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 20");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," +
- "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/',"
+ + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 21");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," +
- "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/',"
+ + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 22");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," +
- "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/',"
+ + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 23");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," +
- "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/',"
+ + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 24");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," +
- "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/',"
+ + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 25");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," +
- "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/',"
+ + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 26");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," +
- "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/',"
+ + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 27");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," +
- "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/',"
+ + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 28");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," +
- "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/',"
+ + "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 29");
stmt = ConnectDB.getHiveConnection().createStatement();
- sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
- "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," +
- "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'";
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/',"
+ + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
@@ -746,63 +672,83 @@ public class PiwikStatsDB {
ConnectDB.getHiveConnection().close();
}
- private String processPortalURL(String url) {
-
- if (url.indexOf("explore.openaire.eu") > 0) {
- try {
- url = URLDecoder.decode(url, "UTF-8");
- } catch (Exception e) {
- logger.info("Error when decoding the following URL: " + url);
- }
- if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) {
- url = "datasource|"
- + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59);
- } else if (url.indexOf("datasource=") > 0
- && url.substring(url.indexOf("datasource=") + 11).length() >= 46) {
- url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57);
- } else if (url.indexOf("datasourceFilter=") > 0
- && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) {
- url = "datasource|"
- + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63);
- } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) {
- url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56);
- } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) {
- url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56);
- } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46
- && !url.contains("oai:dnet:corda")) {
- url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56);
- } else if (url.indexOf("organizationId=") > 0
- && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) {
- url = "organization|"
- + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61);
- } else {
- url = "";
- }
- } else {
- url = "";
- }
-
- return url;
- }
-
private void updateProdTables() throws SQLException {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Inserting data to piwiklog");
- String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
+ String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
stmt.executeUpdate(sql);
-
+
logger.info("Dropping piwiklogtmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
stmt.executeUpdate(sql);
- logger.info("Dropped piwiklogtmp");
-
+ logger.info("Dropped piwiklogtmp");
+
logger.info("Dropping process_portal_log_tmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp";
stmt.executeUpdate(sql);
- logger.info("Dropped process_portal_log_tmp");
+ logger.info("Dropped process_portal_log_tmp");
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+
+ }
+
+ public void finalizeStats() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Dropping piwiklogtmp");
+ String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped piwiklogtmp");
+
+ logger.info("Dropping process_portal_log_tmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped process_portal_log_tmp");
+
+ logger.info("Dropping irus_sushilogtmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped irus_sushilogtmp");
+
+ logger.info("Dropping irus_sushilogtmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped irus_sushilogtmp_json");
+
+ logger.info("Dropping lareferencialogtmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped lareferencialogtmp_json");
+
+ logger.info("Dropping piwiklogtmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped piwiklogtmp_json");
+
+ logger.info("Dropping process_portal_log_tmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped process_portal_log_tmp_json");
+
+ logger.info("Dropping sarc_sushilogtmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped sarc_sushilogtmp");
+
+ logger.info("Dropping sarc_sushilogtmp_json_array");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped sarc_sushilogtmp_json_array");
+
+ logger.info("Dropping sarc_sushilogtmp_json_non_array");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped sarc_sushilogtmp_json_non_array");
stmt.close();
ConnectDB.getHiveConnection().close();
@@ -868,4 +814,22 @@ public class PiwikStatsDB {
private Connection getConnection() throws SQLException {
return ConnectDB.getHiveConnection();
}
+
+ public void createPedocsOldUsageData() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Creating PeDocs Old Views Table");
+ String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".pedocsoldviews as select * from default.pedocsviews";
+ stmt.executeUpdate(sql);
+ logger.info("PeDocs Old Views Table created");
+
+ logger.info("Creating PeDocs Old Downloads Table");
+ sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".pedocsolddownloads as select * from default.pedocsdownloads";
+ stmt.executeUpdate(sql);
+ logger.info("PeDocs Old Downloads Table created");
+
+ }
}
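
Note (illustrative sketch, not part of this patch): the repeated "Cleaning oai - Step N" UPDATE statements above all apply the same regexp_replace pattern, once per repository domain. The loop form below shows that pattern with an abbreviated domain list and an assumed helper-method name; it presumes it lives inside PiwikStatsDB, where Statement and SQLException are already imported.

	private void cleanOaiPrefixes(Statement stmt) throws SQLException {
		// Abbreviated list for illustration; the patch enumerates each repository explicitly
		String[] oaiDomains = {
			"oai:sapientia.ualg.pt", "oai:repositorio.ipsantarem.pt", "oai:arca.igc.gulbenkian.pt",
			"oai:ubibliorum.ubi.pt", "oai:comum.rcaap.pt"
		};
		for (String domain : oaiDomains) {
			// Normalise identifiers of the form 'oai:host/...' to the canonical 'oai:host:...'
			String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
				+ "SET entity_id = regexp_replace(entity_id, '^" + domain + "/', '" + domain + ":') "
				+ "WHERE entity_id LIKE '" + domain + "/%'";
			stmt.executeUpdate(sql);
		}
	}
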
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java
index 4ca20c52e..e85c972f5 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*;
@@ -33,543 +34,467 @@ import org.slf4j.LoggerFactory;
*/
public class SarcStats {
- private Statement stmtHive = null;
- private Statement stmtImpala = null;
+ private Statement stmtHive = null;
+ private Statement stmtImpala = null;
- private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
+ private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
- public SarcStats() throws Exception {
+ public SarcStats() throws Exception {
// createTables();
- }
+ }
- private void createTables() throws Exception {
- try {
+ private void createTables() throws Exception {
+ try {
- stmtHive = ConnectDB.getHiveConnection().createStatement();
- String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
- stmtHive.executeUpdate(sqlCreateTableSushiLog);
+ stmtHive = ConnectDB.getHiveConnection().createStatement();
+ String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
+ stmtHive.executeUpdate(sqlCreateTableSushiLog);
- // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
- // stmt.executeUpdate(sqlCopyPublicSushiLog);
- String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
- + " ON INSERT TO sushilog "
- + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
- + "sushilog.rid, sushilog.date "
- + "FROM sushilog "
- + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
- stmtHive.executeUpdate(sqlcreateRuleSushiLog);
- String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
- stmtHive.executeUpdate(createSushiIndex);
+ // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
+ // stmt.executeUpdate(sqlCopyPublicSushiLog);
+ String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ + " ON INSERT TO sushilog "
+ + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
+ + "sushilog.rid, sushilog.date "
+ + "FROM sushilog "
+ + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
+ stmtHive.executeUpdate(sqlcreateRuleSushiLog);
+ String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
+ stmtHive.executeUpdate(createSushiIndex);
- stmtHive.close();
- ConnectDB.getHiveConnection().close();
- logger.info("Sushi Tables Created");
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
+ stmtHive.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Sushi Tables Created");
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
- public void reCreateLogDirs() throws IOException {
- FileSystem dfs = FileSystem.get(new Configuration());
+ public void reCreateLogDirs() throws IOException {
+ FileSystem dfs = FileSystem.get(new Configuration());
- logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
- dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true);
+ logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
+ dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true);
- logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
- dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true);
+ logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
+ dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true);
- logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
- dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray));
+ logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
+ dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray));
- logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
- dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray));
- }
+ logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
+ dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray));
+ }
- public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
+ public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
- logger.info("Adding JSON Serde jar");
- stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
- logger.info("Added JSON Serde jar");
+ logger.info("Adding JSON Serde jar");
+ stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+ logger.info("Added JSON Serde jar");
- logger.info("Dropping sarc_sushilogtmp_json_array table");
- String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
- stmt.executeUpdate(drop_sarc_sushilogtmp_json_array);
- logger.info("Dropped sarc_sushilogtmp_json_array table");
+ logger.info("Dropping sarc_sushilogtmp_json_array table");
+ String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
+ stmt.executeUpdate(drop_sarc_sushilogtmp_json_array);
+ logger.info("Dropped sarc_sushilogtmp_json_array table");
- logger.info("Creating sarc_sushilogtmp_json_array table");
- String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n"
- + " `ItemIdentifier` ARRAY<\n"
- + " struct<\n"
- + " `Type`: STRING,\n"
- + " `Value`: STRING\n"
- + " >\n"
- + " >,\n"
- + " `ItemPerformance` struct<\n"
- + " `Period`: struct<\n"
- + " `Begin`: STRING,\n"
- + " `End`: STRING\n"
- + " >,\n"
- + " `Instance`: struct<\n"
- + " `Count`: STRING,\n"
- + " `MetricType`: STRING\n"
- + " >\n"
- + " >\n"
- + ")"
- + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
- + "LOCATION '" + sarcsReportPathArray + "/'\n"
- + "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(create_sarc_sushilogtmp_json_array);
- logger.info("Created sarc_sushilogtmp_json_array table");
+ logger.info("Creating sarc_sushilogtmp_json_array table");
+ String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n"
+ + " `ItemIdentifier` ARRAY<\n"
+ + " struct<\n"
+ + " `Type`: STRING,\n"
+ + " `Value`: STRING\n"
+ + " >\n"
+ + " >,\n"
+ + " `ItemPerformance` struct<\n"
+ + " `Period`: struct<\n"
+ + " `Begin`: STRING,\n"
+ + " `End`: STRING\n"
+ + " >,\n"
+ + " `Instance`: struct<\n"
+ + " `Count`: STRING,\n"
+ + " `MetricType`: STRING\n"
+ + " >\n"
+ + " >\n"
+ + ")"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + sarcsReportPathArray + "/'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(create_sarc_sushilogtmp_json_array);
+ logger.info("Created sarc_sushilogtmp_json_array table");
- logger.info("Dropping sarc_sushilogtmp_json_non_array table");
- String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp_json_non_array";
- stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array);
- logger.info("Dropped sarc_sushilogtmp_json_non_array table");
+ logger.info("Dropping sarc_sushilogtmp_json_non_array table");
+ String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".sarc_sushilogtmp_json_non_array";
+ stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array);
+ logger.info("Dropped sarc_sushilogtmp_json_non_array table");
- logger.info("Creating sarc_sushilogtmp_json_non_array table");
- String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
- + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n"
- + " `ItemIdentifier` struct<\n"
- + " `Type`: STRING,\n"
- + " `Value`: STRING\n"
- + " >,\n"
- + " `ItemPerformance` struct<\n"
- + " `Period`: struct<\n"
- + " `Begin`: STRING,\n"
- + " `End`: STRING\n"
- + " >,\n"
- + " `Instance`: struct<\n"
- + " `Count`: STRING,\n"
- + " `MetricType`: STRING\n"
- + " >\n"
- + " >"
- + ")"
- + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
- + "LOCATION '" + sarcsReportPathNonArray + "/'\n"
- + "TBLPROPERTIES (\"transactional\"=\"false\")";
- stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array);
- logger.info("Created sarc_sushilogtmp_json_non_array table");
+ logger.info("Creating sarc_sushilogtmp_json_non_array table");
+ String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n"
+ + " `ItemIdentifier` struct<\n"
+ + " `Type`: STRING,\n"
+ + " `Value`: STRING\n"
+ + " >,\n"
+ + " `ItemPerformance` struct<\n"
+ + " `Period`: struct<\n"
+ + " `Begin`: STRING,\n"
+ + " `End`: STRING\n"
+ + " >,\n"
+ + " `Instance`: struct<\n"
+ + " `Count`: STRING,\n"
+ + " `MetricType`: STRING\n"
+ + " >\n"
+ + " >"
+ + ")"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + sarcsReportPathNonArray + "/'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array);
+ logger.info("Created sarc_sushilogtmp_json_non_array table");
- logger.info("Creating sarc_sushilogtmp table");
- String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp(source STRING, repository STRING, "
- + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
- + "tblproperties('transactional'='true')";
- stmt.executeUpdate(create_sarc_sushilogtmp);
- logger.info("Created sarc_sushilogtmp table");
+ logger.info("Creating sarc_sushilogtmp table");
+ String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sarc_sushilogtmp(source STRING, repository STRING, "
+ + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
+ + "tblproperties('transactional'='true')";
+ stmt.executeUpdate(create_sarc_sushilogtmp);
+ logger.info("Created sarc_sushilogtmp table");
- logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
- String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
- + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
- + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
- + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array "
- + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
- + "WHERE `ItemIdent`.`Type`='DOI'";
- stmt.executeUpdate(insert_sarc_sushilogtmp);
- logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
+ logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
+ String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
+ + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
+ + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
+ + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array "
+ + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
+ + "WHERE `ItemIdent`.`Type`='DOI'";
+ stmt.executeUpdate(insert_sarc_sushilogtmp);
+ logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
- logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
- insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
- + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
- + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
- + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
- stmt.executeUpdate(insert_sarc_sushilogtmp);
- logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
+ logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
+ insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
+ + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
+ + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
+ + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
+ stmt.executeUpdate(insert_sarc_sushilogtmp);
+ logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
- ConnectDB.getHiveConnection().close();
- }
+ ConnectDB.getHiveConnection().close();
+ }
- public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
+ public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
- logger.info("Creating sushilog table");
- String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog "
- + "(`source` string, "
- + "`repository` string, "
- + "`rid` string, "
- + "`date` string, "
- + "`metric_type` string, "
- + "`count` int)";
- stmt.executeUpdate(createSushilog);
- logger.info("Created sushilog table");
+ logger.info("Creating sushilog table");
+ String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog "
+ + "(`source` string, "
+ + "`repository` string, "
+ + "`rid` string, "
+ + "`date` string, "
+ + "`metric_type` string, "
+ + "`count` int)";
+ stmt.executeUpdate(createSushilog);
+ logger.info("Created sushilog table");
- logger.info("Dropping sarc_sushilogtmp table");
- String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp";
- stmt.executeUpdate(drop_sarc_sushilogtmp);
- logger.info("Dropped sarc_sushilogtmp table");
- ConnectDB.getHiveConnection().close();
+ logger.info("Dropping sarc_sushilogtmp table");
+ String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".sarc_sushilogtmp";
+ stmt.executeUpdate(drop_sarc_sushilogtmp);
+ logger.info("Dropped sarc_sushilogtmp table");
+ ConnectDB.getHiveConnection().close();
- List issnAndUrls = new ArrayList();
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030"
- });
- issnAndUrls.add(new String[]{
- "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826"
- });
- issnAndUrls.add(new String[]{
- "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015"
- });
+ List issnAndUrls = new ArrayList();
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030"
+ });
+ issnAndUrls.add(new String[] {
+ "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015"
+ });
- if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0
- && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) {
- logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload);
- issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload);
- }
+ if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0
+ && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) {
+ logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload);
+ issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload);
+ }
- logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls);
+ logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls);
- for (String[] issnAndUrl : issnAndUrls) {
- logger.info("Now working on ISSN: " + issnAndUrl[1]);
- getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]);
- }
+ for (String[] issnAndUrl : issnAndUrls) {
+ logger.info("Now working on ISSN: " + issnAndUrl[1]);
+ getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]);
+ }
- }
+ }
- public void finalizeSarcStats() throws Exception {
- stmtHive = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
- stmtImpala = ConnectDB.getImpalaConnection().createStatement();
-/*
- logger.info("Creating downloads_stats table_tmp");
- String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_tmp "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`result_id` string, "
- + "`date` string, "
- + "`count` bigint, "
- + "`openaire` bigint)";
- stmtHive.executeUpdate(createDownloadsStats);
- logger.info("Created downloads_stats_tmp table");
+ public void updateSarcLogs() throws Exception {
+ stmtHive = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+ stmtImpala = ConnectDB.getImpalaConnection().createStatement();
- logger.info("Dropping sarc_sushilogtmp_impala table");
- String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp_impala";
- stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala);
- logger.info("Dropped sarc_sushilogtmp_impala table");
+ // Insert into sushilog
+ logger.info("Inserting into sushilog");
+ String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
+ stmtHive.executeUpdate(insertSushiLog);
+ logger.info("Inserted into sushilog");
- logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala");
- String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp_impala "
- + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
- stmtHive.executeUpdate(createSarcSushilogtmpImpala);
- logger.info("Created sarc_sushilogtmp_impala");
+ stmtHive.close();
+ ConnectDB.getHiveConnection().close();
+ }
- logger.info("Making sarc_sushilogtmp visible to impala");
- String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_sushilogtmp_impala;";
- stmtImpala.executeUpdate(invalidateMetadata);
+ public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray,
+ String url, String issn) throws Exception {
+ logger.info("Processing SARC! issn: " + issn + " with url: " + url);
+ ConnectDB.getHiveConnection().setAutoCommit(false);
- logger.info("Dropping downloads_stats_impala table");
- String drop_downloads_stats_impala = "DROP TABLE IF EXISTS "
- + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_impala";
- stmtHive.executeUpdate(drop_downloads_stats_impala);
- logger.info("Dropped downloads_stats_impala table");
+ SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
- logger.info("Making downloads_stats_impala deletion visible to impala");
- try {
- String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_impala;";
- stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala);
- } catch (SQLException sqle) {
- }
+ // Setting the ending period (last day of the month)
+// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+// end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
- // We run the following query in Impala because it is faster
- logger.info("Creating downloads_stats_impala");
- String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_impala AS "
- + "SELECT s.source, d.id AS repository_id, "
- + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
- + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, "
- + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
- + ConnectDB.getStatsDBSchema() + ".result_pids ro "
- + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
- + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
- stmtImpala.executeUpdate(createDownloadsStatsImpala);
- logger.info("Creating downloads_stats_impala");
+ logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
- // Insert into downloads_stats
- logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp");
- String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
- + ".downloads_stats_tmp SELECT * "
- + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala";
- stmtHive.executeUpdate(insertDStats);
- logger.info("Inserted into downloads_stats_tmp");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ PreparedStatement st = ConnectDB
+ .getHiveConnection()
+ .prepareStatement(
+ "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
+ st.setString(1, issn);
+ ResultSet rs_date = st.executeQuery();
+ Date dateMax = null;
+ while (rs_date.next()) {
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
- logger.info("Creating sushilog table");
- String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`rid` string, "
- + "`date` string, "
- + "`metric_type` string, "
- + "`count` int)";
- stmtHive.executeUpdate(createSushilog);
- logger.info("Created sushilog table");
-*/
- // Insert into sushilog
- logger.info("Inserting into sushilog");
- String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
- + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
- stmtHive.executeUpdate(insertSushiLog);
- logger.info("Inserted into sushilog");
+ // Creating the needed configuration for the correct storing of data
+ Configuration config = new Configuration();
+ config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
+ config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
+ config
+ .set(
+ "fs.hdfs.impl",
+ org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+ config
+ .set(
+ "fs.file.impl",
+ org.apache.hadoop.fs.LocalFileSystem.class.getName());
+ FileSystem dfs = FileSystem.get(config);
- stmtHive.close();
- ConnectDB.getHiveConnection().close();
- }
+ if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn);
+ } else {
+ start.add(Calendar.MONTH, 1);
+ while (start.before(end)) {
+ String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
+ + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
+ start.add(Calendar.MONTH, 1);
- public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray,
- String url, String issn) throws Exception {
- logger.info("Processing SARC! issn: " + issn + " with url: " + url);
- ConnectDB.getHiveConnection().setAutoCommit(false);
+ logger.info("(getARReport) Getting report: " + reportUrl);
+ String text = getJson(reportUrl);
+ if (text == null) {
+ continue;
+ }
- SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
- // Setting the starting period
- Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
- logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
+ JSONParser parser = new JSONParser();
+ JSONObject jsonObject = null;
+ try {
+ jsonObject = (JSONObject) parser.parse(text);
+ } // if there is a parsing error continue with the next url
+ catch (ParseException pe) {
+ continue;
+ }
- // Setting the ending period (last day of the month)
- Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
- end.add(Calendar.MONTH, +1);
- end.add(Calendar.DAY_OF_MONTH, -1);
- logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
+ jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse");
+ jsonObject = (JSONObject) jsonObject.get("sc:Report");
+ if (jsonObject == null) {
+ continue;
+ }
+ jsonObject = (JSONObject) jsonObject.get("c:Report");
+ jsonObject = (JSONObject) jsonObject.get("c:Customer");
+ Object obj = jsonObject.get("c:ReportItems");
+ JSONArray jsonArray = new JSONArray();
+ if (obj instanceof JSONObject) {
+ jsonArray.add(obj);
+ } else {
+ jsonArray = (JSONArray) obj;
+ // jsonArray = (JSONArray) jsonObject.get("c:ReportItems");
+ }
+ if (jsonArray == null) {
+ continue;
+ }
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- PreparedStatement st = ConnectDB
- .getHiveConnection()
- .prepareStatement(
- "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
- st.setString(1, issn);
- ResultSet rs_date = st.executeQuery();
- Date dateMax = null;
- while (rs_date.next()) {
- if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
- && !rs_date.getString(1).equals("")) {
- start.setTime(sdf.parse(rs_date.getString(1)));
- dateMax = sdf.parse(rs_date.getString(1));
- }
- }
- rs_date.close();
+ // Creating the file in the filesystem for the ItemIdentifier as an array object
+ String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_"
+ + simpleDateFormat.format(start.getTime()) + ".json";
+ logger.info("Storing to file: " + filePathArray);
+ FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true);
- // Creating the needed configuration for the correct storing of data
- Configuration config = new Configuration();
- config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
- config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
- config
- .set(
- "fs.hdfs.impl",
- org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
- config
- .set(
- "fs.file.impl",
- org.apache.hadoop.fs.LocalFileSystem.class.getName());
- FileSystem dfs = FileSystem.get(config);
+ // Creating the file in the filesystem for the ItemIdentifier as a non-array object
+ String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_"
+ + simpleDateFormat.format(start.getTime()) + ".json";
+ logger.info("Storing to file: " + filePathNonArray);
+ FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true);
- if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) {
- logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn);
- } else {
-
- while (start.before(end)) {
- String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
- + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
- start.add(Calendar.MONTH, 1);
+ for (Object aJsonArray : jsonArray) {
- logger.info("(getARReport) Getting report: " + reportUrl);
- String text = getJson(reportUrl);
- if (text == null) {
- continue;
- }
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ renameKeysRecursively(":", jsonObjectRow);
- JSONParser parser = new JSONParser();
- JSONObject jsonObject = null;
- try {
- jsonObject = (JSONObject) parser.parse(text);
- } // if there is a parsing error continue with the next url
- catch (ParseException pe) {
- continue;
- }
+ if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) {
+ finNonArray.write(jsonObjectRow.toJSONString().getBytes());
+ finNonArray.writeChar('\n');
+ } else {
+ finArray.write(jsonObjectRow.toJSONString().getBytes());
+ finArray.writeChar('\n');
+ }
+ }
- jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse");
- jsonObject = (JSONObject) jsonObject.get("sc:Report");
- if (jsonObject == null) {
- continue;
- }
- jsonObject = (JSONObject) jsonObject.get("c:Report");
- jsonObject = (JSONObject) jsonObject.get("c:Customer");
- Object obj = jsonObject.get("c:ReportItems");
- JSONArray jsonArray = new JSONArray();
- if (obj instanceof JSONObject) {
- jsonArray.add(obj);
- } else {
- jsonArray = (JSONArray) obj;
- // jsonArray = (JSONArray) jsonObject.get("c:ReportItems");
- }
- if (jsonArray == null) {
- continue;
- }
+ finArray.close();
+ finNonArray.close();
- // Creating the file in the filesystem for the ItemIdentifier as array object
- String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_"
- + simpleDateFormat.format(start.getTime()) + ".json";
- logger.info("Storing to file: " + filePathArray);
- FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true);
+ // Check the file size and delete the file if it is empty
+ File fileArray = new File(filePathArray);
+ if (fileArray.length() == 0) {
+ fileArray.delete();
+ }
+ File fileNonArray = new File(filePathNonArray);
+ if (fileNonArray.length() == 0) {
+ fileNonArray.delete();
+ }
- // Creating the file in the filesystem for the ItemIdentifier as array object
- String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_"
- + simpleDateFormat.format(start.getTime()) + ".json";
- logger.info("Storing to file: " + filePathNonArray);
- FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true);
+ }
- for (Object aJsonArray : jsonArray) {
+ dfs.close();
+ }
+ // ConnectDB.getHiveConnection().close();
+ }
- JSONObject jsonObjectRow = (JSONObject) aJsonArray;
- renameKeysRecursively(":", jsonObjectRow);
-
- if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) {
- finNonArray.write(jsonObjectRow.toJSONString().getBytes());
- finNonArray.writeChar('\n');
- } else {
- finArray.write(jsonObjectRow.toJSONString().getBytes());
- finArray.writeChar('\n');
- }
- }
-
- finArray.close();
- finNonArray.close();
-
- // Check the file size and if it is too big, delete it
- File fileArray = new File(filePathArray);
- if (fileArray.length() == 0)
- fileArray.delete();
- File fileNonArray = new File(filePathNonArray);
- if (fileNonArray.length() == 0)
- fileNonArray.delete();
-
- }
-
- dfs.close();
- }
- //ConnectDB.getHiveConnection().close();
- }
-
- private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception {
- for (Object jjval : givenJsonObj) {
- if (jjval instanceof JSONArray) {
- renameKeysRecursively(delimiter, (JSONArray) jjval);
- } else if (jjval instanceof JSONObject) {
- renameKeysRecursively(delimiter, (JSONObject) jjval);
- } // All other types of vals
- else
+ private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception {
+ for (Object jjval : givenJsonObj) {
+ if (jjval instanceof JSONArray) {
+ renameKeysRecursively(delimiter, (JSONArray) jjval);
+ } else if (jjval instanceof JSONObject) {
+ renameKeysRecursively(delimiter, (JSONObject) jjval);
+ } // All other types of vals
+ else
;
- }
- }
+ }
+ }
- private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception {
- Set jkeys = new HashSet(givenJsonObj.keySet());
- for (String jkey : jkeys) {
+ private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception {
+ Set jkeys = new HashSet(givenJsonObj.keySet());
+ for (String jkey : jkeys) {
- String[] splitArray = jkey.split(delimiter);
- String newJkey = splitArray[splitArray.length - 1];
+ String[] splitArray = jkey.split(delimiter);
+ String newJkey = splitArray[splitArray.length - 1];
- Object jval = givenJsonObj.get(jkey);
- givenJsonObj.remove(jkey);
- givenJsonObj.put(newJkey, jval);
+ Object jval = givenJsonObj.get(jkey);
+ givenJsonObj.remove(jkey);
+ givenJsonObj.put(newJkey, jval);
- if (jval instanceof JSONObject) {
- renameKeysRecursively(delimiter, (JSONObject) jval);
- }
+ if (jval instanceof JSONObject) {
+ renameKeysRecursively(delimiter, (JSONObject) jval);
+ }
- if (jval instanceof JSONArray) {
- renameKeysRecursively(delimiter, (JSONArray) jval);
- }
- }
- }
+ if (jval instanceof JSONArray) {
+ renameKeysRecursively(delimiter, (JSONArray) jval);
+ }
+ }
+ }
- private String getJson(String url) throws Exception {
- // String cred=username+":"+password;
- // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
- try {
- URL website = new URL(url);
- URLConnection connection = website.openConnection();
- // connection.setRequestProperty ("Authorization", "Basic "+encoded);
- StringBuilder response;
- try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
- response = new StringBuilder();
- String inputLine;
- while ((inputLine = in.readLine()) != null) {
- response.append(inputLine);
- response.append("\n");
- }
- }
- return response.toString();
- } catch (Exception e) {
+ private String getJson(String url) throws Exception {
+ // String cred=username+":"+password;
+ // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
+ try {
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
+ // connection.setRequestProperty ("Authorization", "Basic "+encoded);
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+ response.append("\n");
+ }
+ }
+ return response.toString();
+ } catch (Exception e) {
- // Logging error and silently continuing
- logger.error("Failed to get URL: " + e);
- System.out.println("Failed to get URL: " + e);
+ // Logging error and silently continuing
+ logger.error("Failed to get URL: " + e);
+ System.out.println("Failed to get URL: " + e);
// return null;
// throw new Exception("Failed to get URL: " + e.toString(), e);
- }
- return "";
- }
+ }
+ return "";
+ }
}
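
Note (illustrative sketch, not part of this patch): getARReport above issues one SUSHI-Lite AR1 request per month, using the same value for BeginDate and EndDate. The standalone example below only shows how that request URL is assembled; the endpoint and month are example values, while in the patch they come from the hard-coded list in getAndProcessSarc and from ExecuteWorkflow.

	import java.text.SimpleDateFormat;
	import java.util.Calendar;

	public class SarcReportUrlExample {
		public static void main(String[] args) {
			// Month being requested; example value only
			SimpleDateFormat monthFormat = new SimpleDateFormat("yyyy-MM");
			Calendar month = Calendar.getInstance();
			month.set(2020, Calendar.JANUARY, 1);
			// One of the SUSHI-Lite endpoints listed in getAndProcessSarc
			String baseUrl = "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/";
			String reportUrl = baseUrl + "GetReport/?Report=AR1&Format=json&BeginDate="
				+ monthFormat.format(month.getTime()) + "&EndDate=" + monthFormat.format(month.getTime());
			// Prints .../GetReport/?Report=AR1&Format=json&BeginDate=2020-01&EndDate=2020-01
			System.out.println(reportUrl);
		}
	}
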
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java
index bf2187569..07e15605f 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java
@@ -13,7 +13,7 @@ import org.slf4j.LoggerFactory;
/**
* Main class for downloading and processing Usage statistics
- *
+ *
* @author D. Pierrakos, S. Zoupanos
*/
public class UsageStatsExporter {
@@ -51,19 +51,13 @@ public class UsageStatsExporter {
logger.info("Initialising DB properties");
ConnectDB.init();
-// runImpalaQuery();
-
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
logger.info("Re-creating database and tables");
- if (ExecuteWorkflow.recreateDbAndTables){
+ if (ExecuteWorkflow.recreateDbAndTables) {
piwikstatsdb.recreateDBAndTables();
- logger.info("DB-Tables-TmpTables are created ");
- }
-// else {
-// piwikstatsdb.createTmpTables();
-// logger.info("TmpTables are created ");
-// }
+ logger.info("DB-Tables-TmpTables are created ");
+ }
logger.info("Initializing the download logs module");
PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
@@ -106,9 +100,8 @@ public class UsageStatsExporter {
lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
logger.info("Downloaded LaReferencia logs");
}
-
- LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
+ LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
if (ExecuteWorkflow.processLaReferenciaLogs) {
logger.info("Processing LaReferencia logs");
@@ -116,7 +109,6 @@ public class UsageStatsExporter {
logger.info("LaReferencia logs done");
}
-
IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
logger.info("Creating Irus Stats tables");
@@ -132,14 +124,11 @@ public class UsageStatsExporter {
irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
}
-
- if (ExecuteWorkflow.irusProcessStats) {
+ if (ExecuteWorkflow.irusProcessStats) {
irusstats.processIrusStats();
logger.info("Irus done");
}
-
-
SarcStats sarcStats = new SarcStats();
if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
sarcStats.reCreateLogDirs();
@@ -148,51 +137,70 @@ public class UsageStatsExporter {
sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
}
-
- if (ExecuteWorkflow.sarcProcessStats) {
+ if (ExecuteWorkflow.sarcProcessStats) {
sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
- sarcStats.finalizeSarcStats();
+ sarcStats.updateSarcLogs();
}
logger.info("Sarc done");
-
-
-/*
// finalize usagestats
+
+ logger.info("Dropping tmp tables");
if (ExecuteWorkflow.finalizeStats) {
piwikstatsdb.finalizeStats();
- logger.info("Finalized stats");
+ logger.info("Dropped tmp tables");
}
-*/
-/*
- // Make the tables available to Impala
- if (ExecuteWorkflow.finalTablesVisibleToImpala) {
- logger.info("Making tables visible to Impala");
- invalidateMetadata();
- }
-*/
-
- logger.info("End");
+ logger.info("Raw Data Download End");
}
- private void invalidateMetadata() throws SQLException {
- Statement stmt = null;
+ public void createdDBWithTablesOnly() throws Exception {
+ logger.info("Initialising DB properties");
+ ConnectDB.init();
- stmt = ConnectDB.getImpalaConnection().createStatement();
+ PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
+ piwikstatsdb.recreateDBAndTables();
- String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
+ piwikstatsdb.createPedocsOldUsageData();
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Creating LaReferencia tables");
+ String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
+ + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ + "stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableLareferenciaLog);
+ logger.info("Created LaReferencia tables");
+
+ logger.info("Creating sushilog");
+
+ String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog(source STRING, "
+ + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
+ + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableSushiLog);
+ logger.info("Created sushilog");
+
+ logger.info("Updating piwiklog");
+ String sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklog select * from openaire_prod_usage_raw.piwiklog";
stmt.executeUpdate(sql);
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
+ logger.info("Updating lareferencialog");
+ sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ + ".lareferencialog select * from openaire_prod_usage_raw.lareferencialog";
stmt.executeUpdate(sql);
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
+ logger.info("Updating sushilog");
+ sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog select * from openaire_prod_usage_raw.sushilog";
stmt.executeUpdate(sql);
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
- stmt.executeUpdate(sql);
-
- stmt.close();
+ stmt.close();
ConnectDB.getHiveConnection().close();
+ logger.info("Sushi Tables Created");
+
}
+
}
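
Note (illustrative sketch, not part of this patch): the three insert-select statements in createdDBWithTablesOnly copy the raw production tables one by one with the same SQL shape. The loop below is a sketch of that pattern only; the helper-method name is assumed, it presumes it sits in UsageStatsExporter where Statement and SQLException are available, and it keeps the source schema name openaire_prod_usage_raw used in the patch.

	private void copyRawProductionTables(Statement stmt) throws SQLException {
		String[] rawTables = { "piwiklog", "lareferencialog", "sushilog" };
		for (String table : rawTables) {
			// Copy each raw table from the production schema into the working usage-stats schema
			String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + "." + table
				+ " select * from openaire_prod_usage_raw." + table;
			stmt.executeUpdate(sql);
		}
	}
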
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json
index 988c23b48..1aa5ad6f8 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json
@@ -125,12 +125,6 @@
"paramDescription": "Starting log period",
"paramRequired": true
},
- {
- "paramName": "elp",
- "paramLongName": "endingLogPeriod",
- "paramDescription": "Ending log period",
- "paramRequired": true
- },
{
"paramName": "npidd",
"paramLongName": "numberOfPiwikIdsToDownload",
@@ -216,12 +210,6 @@
"paramDescription": "Create the usage_stats table?",
"paramRequired": true
},
- {
- "paramName": "ftvi",
- "paramLongName": "finalTablesVisibleToImpala",
- "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
- "paramRequired": true
- },
{
"paramName": "nodt",
"paramLongName": "numberOfDownloadThreads",
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml
index a6600516d..022a107ab 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml
@@ -63,7 +63,6 @@
<arg>--downloadPiwikLogs</arg><arg>${downloadPiwikLogs}</arg>
<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
<arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
- <arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
<arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
<arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
<arg>--laReferenciaEmptyDirs</arg><arg>${laReferenciaEmptyDirs}</arg>
@@ -78,7 +77,6 @@
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
<arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
- <arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>
diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml
index f400239f5..79fabb603 100644
--- a/dhp-workflows/dhp-usage-stats-build/pom.xml
+++ b/dhp-workflows/dhp-usage-stats-build/pom.xml
@@ -23,7 +23,35 @@
4.0.0
dhp-usage-stats-build
-
+
+
+
+ pl.project13.maven
+ git-commit-id-plugin
+ 2.1.15
+
+
+
+ revision
+
+
+
+
+ ${project.basedir}/../.git
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.6.1
+
+
+ 1.8
+
+
+
+
UTF-8
UTF-8
diff --git a/dhp-workflows/dhp-usage-stats-build/runworkflow.sh b/dhp-workflows/dhp-usage-stats-build/runworkflow.sh
new file mode 100755
index 000000000..191fb24c6
--- /dev/null
+++ b/dhp-workflows/dhp-usage-stats-build/runworkflow.sh
@@ -0,0 +1 @@
+mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagestatsbuild
\ No newline at end of file
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java
index 8f0f8eae7..e53709f1a 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java
@@ -3,12 +3,17 @@
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
+
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+import java.util.Date;
import java.util.Properties;
import org.apache.log4j.Logger;
@@ -23,108 +28,120 @@ import com.mchange.v2.c3p0.ComboPooledDataSource;
public abstract class ConnectDB {
- public static Connection DB_HIVE_CONNECTION;
- public static Connection DB_IMPALA_CONNECTION;
+ public static Connection DB_HIVE_CONNECTION;
+ public static Connection DB_IMPALA_CONNECTION;
- private static String dbHiveUrl;
- private static String dbImpalaUrl;
- private static String usageRawDataDBSchema;
- private static String usageStatsDBSchema;
- private static String statsDBSchema;
- private final static Logger log = Logger.getLogger(ConnectDB.class);
+ private static String dbHiveUrl;
+ private static String dbImpalaUrl;
+ private static String usageRawDataDBSchema;
+ private static String usageStatsDBSchema;
+ private static String usagestatsPermanentDBSchema;
+ private static String statsDBSchema;
+ private final static Logger log = Logger.getLogger(ConnectDB.class);
- static void init() throws ClassNotFoundException {
+ static void init() throws ClassNotFoundException {
- dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
- dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
- usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema;
- statsDBSchema = ExecuteWorkflow.statsDBSchema;
- usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema;
+ dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
+ dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
+ usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema;
+ statsDBSchema = ExecuteWorkflow.statsDBSchema;
+ usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema;
+ usagestatsPermanentDBSchema = ExecuteWorkflow.usagestatsPermanentDBSchema;
- Class.forName("org.apache.hive.jdbc.HiveDriver");
- }
+ Class.forName("org.apache.hive.jdbc.HiveDriver");
+ }
- public static Connection getHiveConnection() throws SQLException {
- if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) {
- return DB_HIVE_CONNECTION;
- } else {
- DB_HIVE_CONNECTION = connectHive();
+ public static Connection getHiveConnection() throws SQLException {
+ if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) {
+ return DB_HIVE_CONNECTION;
+ } else {
+ DB_HIVE_CONNECTION = connectHive();
- return DB_HIVE_CONNECTION;
- }
- }
+ return DB_HIVE_CONNECTION;
+ }
+ }
- public static Connection getImpalaConnection() throws SQLException {
- if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) {
- return DB_IMPALA_CONNECTION;
- } else {
- DB_IMPALA_CONNECTION = connectImpala();
+ public static Connection getImpalaConnection() throws SQLException {
+ if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) {
+ return DB_IMPALA_CONNECTION;
+ } else {
+ DB_IMPALA_CONNECTION = connectImpala();
- return DB_IMPALA_CONNECTION;
- }
- }
+ return DB_IMPALA_CONNECTION;
+ }
+ }
- public static String getUsageRawDataDBSchema() {
- return usageRawDataDBSchema;
- }
+ public static String getUsageRawDataDBSchema() {
+ return ConnectDB.usageRawDataDBSchema;
+ }
- public static String getUsageStatsDBSchema() {
- return ConnectDB.usageStatsDBSchema;
- }
+ public static String getUsageStatsDBSchema() {
+ String datePattern = "yyyyMMdd";
+ DateFormat df = new SimpleDateFormat(datePattern);
+// Get today's date using a Calendar object.
+ Date today = Calendar.getInstance().getTime();
+ String todayAsString = df.format(today);
- public static String getStatsDBSchema() {
- return ConnectDB.statsDBSchema;
- }
+ return ConnectDB.usageStatsDBSchema + "_" + todayAsString;
+ }
- private static Connection connectHive() throws SQLException {
- /*
+ public static String getStatsDBSchema() {
+ return ConnectDB.statsDBSchema;
+ }
+
+ public static String getUsagestatsPermanentDBSchema() {
+ return ConnectDB.usagestatsPermanentDBSchema;
+ }
+
+ private static Connection connectHive() throws SQLException {
+ /*
* Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt =
* connection.createStatement(); log.debug("Opened database successfully"); return connection;
- */
- ComboPooledDataSource cpds = new ComboPooledDataSource();
- cpds.setJdbcUrl(dbHiveUrl);
- cpds.setAcquireIncrement(1);
- cpds.setMaxPoolSize(100);
- cpds.setMinPoolSize(1);
- cpds.setInitialPoolSize(1);
- cpds.setMaxIdleTime(300);
- cpds.setMaxConnectionAge(36000);
+ */
+ ComboPooledDataSource cpds = new ComboPooledDataSource();
+ cpds.setJdbcUrl(dbHiveUrl);
+ cpds.setAcquireIncrement(1);
+ cpds.setMaxPoolSize(100);
+ cpds.setMinPoolSize(1);
+ cpds.setInitialPoolSize(1);
+ cpds.setMaxIdleTime(300);
+ cpds.setMaxConnectionAge(36000);
- cpds.setAcquireRetryAttempts(30);
- cpds.setAcquireRetryDelay(2000);
- cpds.setBreakAfterAcquireFailure(false);
+ cpds.setAcquireRetryAttempts(30);
+ cpds.setAcquireRetryDelay(2000);
+ cpds.setBreakAfterAcquireFailure(false);
- cpds.setCheckoutTimeout(0);
- cpds.setPreferredTestQuery("SELECT 1");
- cpds.setIdleConnectionTestPeriod(60);
- return cpds.getConnection();
+ cpds.setCheckoutTimeout(0);
+ cpds.setPreferredTestQuery("SELECT 1");
+ cpds.setIdleConnectionTestPeriod(60);
+ return cpds.getConnection();
- }
+ }
- private static Connection connectImpala() throws SQLException {
- /*
+ private static Connection connectImpala() throws SQLException {
+ /*
* Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt =
* connection.createStatement(); log.debug("Opened database successfully"); return connection;
- */
- ComboPooledDataSource cpds = new ComboPooledDataSource();
- cpds.setJdbcUrl(dbImpalaUrl);
- cpds.setAcquireIncrement(1);
- cpds.setMaxPoolSize(100);
- cpds.setMinPoolSize(1);
- cpds.setInitialPoolSize(1);
- cpds.setMaxIdleTime(300);
- cpds.setMaxConnectionAge(36000);
+ */
+ ComboPooledDataSource cpds = new ComboPooledDataSource();
+ cpds.setJdbcUrl(dbImpalaUrl);
+ cpds.setAcquireIncrement(1);
+ cpds.setMaxPoolSize(100);
+ cpds.setMinPoolSize(1);
+ cpds.setInitialPoolSize(1);
+ cpds.setMaxIdleTime(300);
+ cpds.setMaxConnectionAge(36000);
- cpds.setAcquireRetryAttempts(30);
- cpds.setAcquireRetryDelay(2000);
- cpds.setBreakAfterAcquireFailure(false);
+ cpds.setAcquireRetryAttempts(30);
+ cpds.setAcquireRetryDelay(2000);
+ cpds.setBreakAfterAcquireFailure(false);
- cpds.setCheckoutTimeout(0);
- cpds.setPreferredTestQuery("SELECT 1");
- cpds.setIdleConnectionTestPeriod(60);
+ cpds.setCheckoutTimeout(0);
+ cpds.setPreferredTestQuery("SELECT 1");
+ cpds.setIdleConnectionTestPeriod(60);
- return cpds.getConnection();
+ return cpds.getConnection();
- }
+ }
}
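
Note: with this change, getUsageStatsDBSchema() derives a dated schema name, so each run of the stats build writes into its own snapshot database. A standalone sketch of that derivation is shown below; the base schema name is a placeholder, and the lowercase "yyyyMMdd" pattern is used because SimpleDateFormat treats "YYYY" as the week-based year rather than the calendar year.

	import java.text.SimpleDateFormat;
	import java.util.Calendar;

	// Sketch of the dated-schema derivation performed by getUsageStatsDBSchema().
	public class DatedSchemaSketch {
		public static void main(String[] args) {
			String baseSchema = "usagestats"; // placeholder base name
			String suffix = new SimpleDateFormat("yyyyMMdd").format(Calendar.getInstance().getTime());
			System.out.println(baseSchema + "_" + suffix); // e.g. usagestats_20210115
		}
	}
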
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java
index 3f958abba..26e44b1f6 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java
@@ -3,6 +3,7 @@
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
+
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.text.SimpleDateFormat;
@@ -11,162 +12,142 @@ import java.util.Date;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.BasicConfigurator;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
/**
* @author D. Pierrakos, S. Zoupanos
*/
public class ExecuteWorkflow {
- static String matomoAuthToken;
- static String matomoBaseURL;
- static String repoLogPath;
- static String portalLogPath;
- static String portalMatomoID;
- static String irusUKBaseURL;
- static String irusUKReportPath;
- static String sarcsReportPathArray;
- static String sarcsReportPathNonArray;
- static String lareferenciaLogPath;
- static String lareferenciaBaseURL;
- static String lareferenciaAuthToken;
- static String dbHiveUrl;
- static String dbImpalaUrl;
- static String usageRawDataDBSchema;
- static String usageStatsDBSchema;
- static String statsDBSchema;
- static boolean recreateDbAndTables;
+// static String matomoAuthToken;
+ static String matomoBaseURL;
+ static String repoLogPath;
+ static String portalLogPath;
+ static String portalMatomoID;
+// static String irusUKBaseURL;
+ static String irusUKReportPath;
+ static String sarcsReportPathArray;
+ static String sarcsReportPathNonArray;
+ static String lareferenciaLogPath;
+// static String lareferenciaBaseURL;
+// static String lareferenciaAuthToken;
+ static String dbHiveUrl;
+ static String dbImpalaUrl;
+ static String usageRawDataDBSchema;
+ static String usageStatsDBSchema;
+ static String usagestatsPermanentDBSchema;
+ static String statsDBSchema;
+ static boolean recreateDbAndTables;
- static boolean piwikEmptyDirs;
- static boolean downloadPiwikLogs;
- static boolean processPiwikLogs;
+ static boolean processPiwikLogs;
+ static boolean processLaReferenciaLogs;
- static Calendar startingLogPeriod;
- static Calendar endingLogPeriod;
- static int numberOfPiwikIdsToDownload;
- static int numberOfSiteIdsToDownload;
+ static boolean irusProcessStats;
- static boolean laReferenciaEmptyDirs;
- static boolean downloadLaReferenciaLogs;
- static boolean processLaReferenciaLogs;
+ static boolean sarcProcessStats;
- static boolean irusCreateTablesEmptyDirs;
- static boolean irusDownloadReports;
- static boolean irusProcessStats;
- static int irusNumberOfOpendoarsToDownload;
+ static boolean finalizeStats;
+ static boolean finalTablesVisibleToImpala;
- static boolean sarcCreateTablesEmptyDirs;
- static boolean sarcDownloadReports;
- static boolean sarcProcessStats;
- static int sarcNumberOfIssnToDownload;
+ static int numberOfDownloadThreads;
- static boolean finalizeStats;
- static boolean finalTablesVisibleToImpala;
+ private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
- static int numberOfDownloadThreads;
+ public static void main(String args[]) throws Exception {
- private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
+ // Sending the logs to the console
+ BasicConfigurator.configure();
- public static void main(String args[]) throws Exception {
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils
+ .toString(
+ UsageStatsExporter.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json")));
+ parser.parseArgument(args);
- // Sending the logs to the console
- BasicConfigurator.configure();
+ // Setting up the initial parameters
+// matomoAuthToken = parser.get("matomoAuthToken");
+// matomoBaseURL = parser.get("matomoBaseURL");
+ repoLogPath = parser.get("repoLogPath");
+ portalLogPath = parser.get("portalLogPath");
+ portalMatomoID = parser.get("portalMatomoID");
+// irusUKBaseURL = parser.get("irusUKBaseURL");
+ irusUKReportPath = parser.get("irusUKReportPath");
+ sarcsReportPathArray = parser.get("sarcsReportPathArray");
+ sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
+ lareferenciaLogPath = parser.get("lareferenciaLogPath");
+// lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
+// lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
- final ArgumentApplicationParser parser = new ArgumentApplicationParser(
- IOUtils
- .toString(
- UsageStatsExporter.class
- .getResourceAsStream(
- "/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json")));
- parser.parseArgument(args);
+ dbHiveUrl = parser.get("dbHiveUrl");
+ dbImpalaUrl = parser.get("dbImpalaUrl");
+ usageRawDataDBSchema = parser.get("usageRawDataDBSchema");
+ usageStatsDBSchema = parser.get("usageStatsDBSchema");
+ usagestatsPermanentDBSchema = parser.get("usagestatsPermanentDBSchema");
+ statsDBSchema = parser.get("statsDBSchema");
- // Setting up the initial parameters
- matomoAuthToken = parser.get("matomoAuthToken");
- matomoBaseURL = parser.get("matomoBaseURL");
- repoLogPath = parser.get("repoLogPath");
- portalLogPath = parser.get("portalLogPath");
- portalMatomoID = parser.get("portalMatomoID");
- irusUKBaseURL = parser.get("irusUKBaseURL");
- irusUKReportPath = parser.get("irusUKReportPath");
- sarcsReportPathArray = parser.get("sarcsReportPathArray");
- sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
- lareferenciaLogPath = parser.get("lareferenciaLogPath");
- lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
- lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
+ if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
+ processPiwikLogs = true;
+ } else {
+ processPiwikLogs = false;
+ }
- dbHiveUrl = parser.get("dbHiveUrl");
- dbImpalaUrl = parser.get("dbImpalaUrl");
- usageRawDataDBSchema = parser.get("usageRawDataDBSchema");
- usageStatsDBSchema = parser.get("usageStatsDBSchema");
- statsDBSchema = parser.get("statsDBSchema");
+// String startingLogPeriodStr = parser.get("startingLogPeriod");
+// Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
+// startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
+//
+// String endingLogPeriodStr = parser.get("endingLogPeriod");
+// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
+// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
- if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
- processPiwikLogs = true;
- } else {
- processPiwikLogs = false;
- }
+ if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
+ recreateDbAndTables = true;
+ } else {
+ recreateDbAndTables = false;
+ }
- String startingLogPeriodStr = parser.get("startingLogPeriod");
- Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
- startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
+ if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) {
+ processLaReferenciaLogs = true;
+ } else {
+ processLaReferenciaLogs = false;
+ }
- String endingLogPeriodStr = parser.get("endingLogPeriod");
- Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
- endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
+ if (parser.get("irusProcessStats").toLowerCase().equals("true")) {
+ irusProcessStats = true;
+ } else {
+ irusProcessStats = false;
+ }
- numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
- numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
+ if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
+ sarcProcessStats = true;
+ } else {
+ sarcProcessStats = false;
+ }
- if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
- recreateDbAndTables = true;
- } else {
- recreateDbAndTables = false;
- }
+ if (parser.get("finalizeStats").toLowerCase().equals("true")) {
+ finalizeStats = true;
+ } else {
+ finalizeStats = false;
+ }
+ if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) {
+ finalTablesVisibleToImpala = true;
+ } else {
+ numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
+ }
- if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) {
- processLaReferenciaLogs = true;
- } else {
- processLaReferenciaLogs = false;
- }
+ UsageStatsExporter usagestatsExport = new UsageStatsExporter();
+ usagestatsExport.export();
+ }
- if (parser.get("irusProcessStats").toLowerCase().equals("true")) {
- irusProcessStats = true;
- } else {
- irusProcessStats = false;
- }
+ private static Calendar startingLogPeriodStr(Date date) {
- irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload"));
+ Calendar calendar = Calendar.getInstance();
+ calendar.setTime(date);
+ return calendar;
- if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
- sarcProcessStats = true;
- } else {
- sarcProcessStats = false;
- }
- sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload"));
-
- if (parser.get("finalizeStats").toLowerCase().equals("true")) {
- finalizeStats = true;
- } else {
- finalizeStats = false;
- }
- if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) {
- finalTablesVisibleToImpala = true;
- } else {
- numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
- }
-
- UsageStatsExporter usagestatsExport = new UsageStatsExporter();
- usagestatsExport.export();
- }
-
- private static Calendar startingLogPeriodStr(Date date) {
-
- Calendar calendar = Calendar.getInstance();
- calendar.setTime(date);
- return calendar;
-
- }
+ }
}
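
Note: ExecuteWorkflow maps every boolean workflow parameter through a verbose if/else on the string returned by parser.get(). A hypothetical helper that captures the same intent more compactly is sketched below; it is not part of this patch.

	// Hypothetical helper (not part of this patch): interpret a workflow
	// parameter string such as "true"/"false" as a boolean flag.
	static boolean flag(String rawValue) {
		return rawValue != null && rawValue.equalsIgnoreCase("true");
	}

	// e.g. processPiwikLogs = flag(parser.get("processPiwikLogs"));
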
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java
index 4f34adc04..4439f848e 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.*;
@@ -27,45 +28,42 @@ import org.slf4j.LoggerFactory;
*/
public class IrusStats {
- private String irusUKURL;
+ private String irusUKURL;
- private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
+ private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
- public IrusStats() throws Exception {
- }
+ public IrusStats() throws Exception {
+ }
-
- public void processIrusStats() throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
+ public void processIrusStats() throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+ logger.info("Creating irus_downloads_stats_tmp table");
+ String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".irus_downloads_stats_tmp "
+ + "(`source` string, "
+ + "`repository_id` string, "
+ + "`result_id` string, "
+ + "`date` string, "
+ + "`count` bigint, "
+ + "`openaire` bigint)";
+ stmt.executeUpdate(createDownloadsStats);
+ logger.info("Created irus_downloads_stats_tmp table");
- logger.info("Creating irus_downloads_stats_tmp table");
- String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".irus_downloads_stats_tmp "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`result_id` string, "
- + "`date` string, "
- + "`count` bigint, "
- + "`openaire` bigint)";
- stmt.executeUpdate(createDownloadsStats);
- logger.info("Created irus_downloads_stats_tmp table");
+ logger.info("Inserting into irus_downloads_stats_tmp");
+ String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp "
+ + "SELECT s.source, d.id AS repository_id, "
+ + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
+ + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
+ + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+ + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
+ stmt.executeUpdate(insertDStats);
+ logger.info("Inserted into irus_downloads_stats_tmp");
- logger.info("Inserting into irus_downloads_stats_tmp");
- String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp "
- + "SELECT s.source, d.id AS repository_id, "
- + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
- + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
- + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
- + ConnectDB.getStatsDBSchema() + ".result_oids ro "
- + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
- stmt.executeUpdate(insertDStats);
- logger.info("Inserted into irus_downloads_stats_tmp");
+ stmt.close();
+ // ConnectDB.getHiveConnection().close();
+ }
- stmt.close();
- //ConnectDB.getHiveConnection().close();
- }
-
-
}
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java
index ea3ac5948..0d34ebef3 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java
@@ -41,8 +41,6 @@ public class LaReferenciaStats {
public LaReferenciaStats() throws Exception {
}
-
-
public void processLogs() throws Exception {
try {
logger.info("LaReferencia creating viewsStats");
@@ -62,7 +60,6 @@ public class LaReferenciaStats {
}
}
-
public void viewsStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
@@ -101,7 +98,7 @@ public class LaReferenciaStats {
logger.info("Created la_views_stats_tmp table");
stmt.close();
- ConnectDB.getHiveConnection().close();
+ // ConnectDB.getHiveConnection().close();
}
private void downloadsStats() throws Exception {
@@ -142,8 +139,7 @@ public class LaReferenciaStats {
logger.info("Created la_downloads_stats_tmp table");
stmt.close();
- //ConnectDB.getHiveConnection().close();
+ // ConnectDB.getHiveConnection().close();
}
-
}
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java
index a165c6eab..253dc03b5 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java
@@ -1,22 +1,15 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export;
-import java.io.*;
-import java.net.URLDecoder;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
+import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.sql.Timestamp;
/**
* @author D. Pierrakos, S. Zoupanos
@@ -29,37 +22,51 @@ public class PiwikStatsDB {
private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
-
public PiwikStatsDB() throws Exception {
}
-
public void recreateDBAndTables() throws Exception {
this.createDatabase();
// The piwiklog table is not needed since it is built
// on top of JSON files
- ////////////this.createTmpTables();
+ //////////// this.createTmpTables();
}
private void createDatabase() throws Exception {
+
+// try {
+//
+// stmt = ConnectDB.getHiveConnection().createStatement();
+//
+// logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
+// String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
+// stmt.executeUpdate(dropDatabase);
+// } catch (Exception e) {
+// logger.error("Failed to drop database: " + e);
+// throw new Exception("Failed to drop database: " + e.toString(), e);
+// }
+//
try {
stmt = ConnectDB.getHiveConnection().createStatement();
+ logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
+ String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema();
+ stmt.executeUpdate(createDatabase);
+ logger.info("Usagestats DB created: " + ConnectDB.getUsageStatsDBSchema());
- logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
- String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
- stmt.executeUpdate(dropDatabase);
} catch (Exception e) {
- logger.error("Failed to drop database: " + e);
- throw new Exception("Failed to drop database: " + e.toString(), e);
+ logger.error("Failed to create database: " + e);
+ throw new Exception("Failed to create database: " + e.toString(), e);
}
try {
stmt = ConnectDB.getHiveConnection().createStatement();
- logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
- String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema();
- stmt.executeUpdate(createDatabase);
+ logger.info("Creating permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema());
+ String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS "
+ + ConnectDB.getUsagestatsPermanentDBSchema();
+ stmt.executeUpdate(createPermanentDatabase);
+ logger.info("Created permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema());
} catch (Exception e) {
logger.error("Failed to create database: " + e);
@@ -67,17 +74,16 @@ public class PiwikStatsDB {
}
}
-
public void processLogs() throws Exception {
try {
- logger.info("ViewsStats processing starts at: "+new Timestamp(System.currentTimeMillis()));
+ logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis()));
viewsStats();
- logger.info("ViewsStats processing ends at: "+new Timestamp(System.currentTimeMillis()));
+ logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis()));
- logger.info("DownloadsStats processing starts at: "+new Timestamp(System.currentTimeMillis()));
+ logger.info("DownloadsStats processing starts at: " + new Timestamp(System.currentTimeMillis()));
downloadsStats();
- logger.info("DownloadsStats processing ends at: "+new Timestamp(System.currentTimeMillis()));
+ logger.info("DownloadsStats processing ends at: " + new Timestamp(System.currentTimeMillis()));
} catch (Exception e) {
logger.error("Failed to process logs: " + e);
@@ -85,68 +91,68 @@ public class PiwikStatsDB {
}
}
-
-
public void viewsStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping openaire_result_views_monthly_tmp view");
- String drop_result_views_monthly = "DROP VIEW IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".openaire_piwikresult_views_monthly_tmp";
+ String drop_result_views_monthly = "DROP VIEW IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".openaire_piwikresult_views_monthly_tmp";
stmt.executeUpdate(drop_result_views_monthly);
logger.info("Dropped openaire_result_views_monthly_tmp view");
logger.info("Creating openaire_result_views_monthly_tmp view");
String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
- + ".openaire_result_views_monthly_tmp " +
- "AS SELECT entity_id AS id, " +
- "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " +
- "AS openaire_referrer, " +
- "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
- "FROM " + ConnectDB.getUsageRawDataDBSchema()
- + ".piwiklog where action='action' and (source_item_type='oaItem' or " +
- "source_item_type='repItem') " +
- "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
- "source ORDER BY source, entity_id";
+ + ".openaire_result_views_monthly_tmp "
+ + "AS SELECT entity_id, "
+ + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
+ + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) "
+ + "AS openaire_referrer, "
+ + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ + "FROM " + ConnectDB.getUsageRawDataDBSchema()
+ + ".piwiklog where action='action' and (source_item_type='oaItem' or "
+ + "source_item_type='repItem') "
+ + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
+ + "source ORDER BY source, entity_id";
stmt.executeUpdate(create_result_views_monthly);
logger.info("Created openaire_result_views_monthly_tmp table");
logger.info("Dropping openaire_views_stats_tmp table");
- String drop_views_stats = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".openaire_views_stats_tmp";
+ String drop_views_stats = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".openaire_views_stats_tmp";
stmt.executeUpdate(drop_views_stats);
logger.info("Dropped openaire_views_stats_tmp table");
logger.info("Creating openaire_views_stats_tmp table");
String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".openaire_views_stats_tmp " +
- "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " +
- "max(views) AS count, max(openaire_referrer) AS openaire " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " +
- ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
- "WHERE p.source=d.piwik_id AND p.id=ro.oid " +
- "GROUP BY d.id, ro.id, month " +
- "ORDER BY d.id, ro.id, month ";
+ + ".openaire_views_stats_tmp "
+ + "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ + "max(views) AS count, max(openaire_referrer) AS openaire "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' "
+ + "GROUP BY d.id, ro.id, month "
+ + "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(create_views_stats);
logger.info("Created openaire_views_stats_tmp table");
logger.info("Creating openaire_pageviews_stats_tmp table");
String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".openaire_pageviews_stats_tmp AS SELECT " +
- "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " +
- ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
- "WHERE p.source=" + ExecuteWorkflow.portalMatomoID + " AND p.source=d.piwik_id and p.id=ro.id \n" +
- "GROUP BY d.id, ro.id, month " +
- "ORDER BY d.id, ro.id, month ";
+ + ".openaire_pageviews_stats_tmp AS SELECT "
+ + "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ + "WHERE p.source=" + ExecuteWorkflow.portalMatomoID
+ + " AND p.source=d.piwik_id and p.id=ro.id AND ro.oid!='200' "
+ + "GROUP BY d.id, ro.id, month "
+ + "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(create_pageviews_stats);
logger.info("Created pageviews_stats table");
stmt.close();
- //ConnectDB.getHiveConnection().close();
+ // ConnectDB.getHiveConnection().close();
}
private void downloadsStats() throws Exception {
@@ -154,152 +160,315 @@ public class PiwikStatsDB {
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping openaire_result_downloads_monthly_tmp view");
- String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".openaire_result_downloads_monthly_tmp";
+ String drop_result_downloads_monthly = "DROP VIEW IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".openaire_result_downloads_monthly_tmp";
stmt.executeUpdate(drop_result_downloads_monthly);
logger.info("Dropped openaire_result_downloads_monthly_tmp view");
logger.info("Creating openaire_result_downloads_monthly_tmp view");
- String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp " +
- "AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " +
- "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " +
- "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
- "FROM " + ConnectDB.getUsageRawDataDBSchema()+ ".piwiklog where action='download' " +
- "AND (source_item_type='oaItem' OR source_item_type='repItem') " +
- "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " +
- "ORDER BY source, entity_id, month";
+ String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
+ + ".openaire_result_downloads_monthly_tmp "
+ + "AS SELECT entity_id, "
+ + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
+ + "COUNT(entity_id) as downloads, "
+ + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog where action='download' "
+ + "AND (source_item_type='oaItem' OR source_item_type='repItem') "
+ + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source "
+ + "ORDER BY source, entity_id, month";
stmt.executeUpdate(sql);
logger.info("Created openaire_result_downloads_monthly_tmp view");
logger.info("Dropping openaire_downloads_stats_tmp table");
- String drop_views_stats = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".openaire_downloads_stats_tmp";
+ String drop_views_stats = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".openaire_downloads_stats_tmp";
stmt.executeUpdate(drop_views_stats);
logger.info("Dropped openaire_downloads_stats_tmp table");
logger.info("Creating openaire_downloads_stats_tmp table");
- sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " +
- "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " +
- "max(downloads) AS count, max(openaire_referrer) AS openaire " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " +
- ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
- "WHERE p.source=d.piwik_id and p.id=ro.oid " +
- "GROUP BY d.id, ro.id, month " +
- "ORDER BY d.id, ro.id, month ";
+ sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS "
+ + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ + "max(downloads) AS count, max(openaire_referrer) AS openaire "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
+ + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ + "WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' "
+ + "GROUP BY d.id, ro.id, month "
+ + "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(sql);
logger.info("Created downloads_stats table");
-
logger.info("Dropping openaire_result_downloads_monthly_tmp view");
- sql = "DROP VIEW IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp";
+ sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp";
logger.info("Dropped openaire_result_downloads_monthly_tmp view ");
stmt.executeUpdate(sql);
stmt.close();
- //ConnectDB.getHiveConnection().close();
+ // ConnectDB.getHiveConnection().close();
+ }
+
+ public void uploadOldPedocs() throws Exception {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ // Dropping Pedocs pedocs_views_stats_tmp table
+ logger.info("Dropping Pedocs pedocs_views_stats_tmp table");
+ String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp";
+ logger.info("Dropped pedocs_views_stats_tmp table ");
+ stmt.executeUpdate(sql);
+
+ // Dropping Pedocs pedocs_downloads_stats table
+ logger.info("Dropping pedocs_downloads_stats table");
+ sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats";
+ logger.info("Dropped pedocs_downloads_stats table ");
+ stmt.executeUpdate(sql);
+
+ // Creating Pedocs pedocs_views_stats_tmp table
+ logger.info("Creating Pedocs pedocs_views_stats_tmp table");
+ sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp AS "
+ + "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id,"
+ + "r.id as result_id,date,counter_abstract as count, 0 as openaire "
+ + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsoldviews p, " + ConnectDB.getStatsDBSchema()
+ + ".result_oids r where r.oid=p.identifier";
+ stmt.executeUpdate(sql);
+ logger.info("Created pedocs_views_stats_tmp table ");
+
+ // Creating Pedocs pedocs_downloads_stats_tmp table
+ logger.info("Creating Pedocs pedocs_downloads_stats_tmp table");
+ sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp AS "
+ + "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id,"
+ + "r.id as result_id, date, counter as count, 0 as openaire "
+ + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsolddownloads p, " + ConnectDB.getStatsDBSchema()
+ + ".result_oids r where r.oid=p.identifier";
+ stmt.executeUpdate(sql);
+ logger.info("Created pedocs_downloads_stats_tmp table ");
+
+ }
+
+ public void uploadTUDELFTStats() throws Exception {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ // Dropping TUDELFT tudelft_result_views_monthly_tmp view
+ logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view");
+ String sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp";
+ logger.info("Dropped tudelft_result_views_monthly_tmp view ");
+ stmt.executeUpdate(sql);
+
+ // Dropping TUDELFT tudelft_result_views_monthly_tmp view
+ logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view");
+ sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp";
+ logger.info("Dropped tudelft_result_downloads_monthly_tmp view ");
+ stmt.executeUpdate(sql);
+
+ // Dropping TUDELFT tudelft_views_stats_tmp table
+ logger.info("Dropping TUDELFT tudelft_views_stats_tmp table");
+ sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp";
+ logger.info("Dropped tudelft_views_stats_tmp table ");
+ stmt.executeUpdate(sql);
+
+ // Dropping TUDELFT tudelft_downloads_stats_tmp table
+ logger.info("Dropping TUDELFT tudelft_downloads_stats_tmp table");
+ sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp";
+ logger.info("Dropped tudelft_downloads_stats_tmp table ");
+ stmt.executeUpdate(sql);
+
+ // Creating TUDELFT tudelft_result_views_monthly_tmp view
+ logger.info("Creating TUDELFT tudelft_result_views_monthly_tmp view");
+ sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp "
+ + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
+ + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog "
+ + "WHERE action='action' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 "
+ + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id";
+ stmt.executeUpdate(sql);
+ logger.info("Created tudelft_result_views_monthly_tmp view ");
+
+ // Creating TUDELFT tudelft_views_stats_tmp table
+ logger.info("Creating TUDELFT tudelft_views_stats_tmp table");
+ sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp AS "
+ + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema()
+ + ".tudelft_result_views_monthly_tmp p, "
+ + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ + "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' "
+ + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id";
+ stmt.executeUpdate(sql);
+ logger.info("Created TUDELFT tudelft_views_stats_tmp table");
+
+ // Creating TUDELFT tudelft_result_downloads_monthly_tmp view
+ logger.info("Creating TUDELFT tudelft_result_downloads_monthly_tmp view");
+ sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp "
+ + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
+ + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog "
+ + "WHERE action='download' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 "
+ + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id";
+ stmt.executeUpdate(sql);
+ logger.info("Created tudelft_result_downloads_monthly_tmp view ");
+
+ // Creating TUDELFT tudelft_downloads_stats_tmp table
+ logger.info("Creating TUDELFT tudelft_downloads_stats_tmp table");
+ sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp AS "
+ + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema()
+ + ".tudelft_result_downloads_monthly_tmp p, "
+ + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ + "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' "
+ + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id";
+ stmt.executeUpdate(sql);
+ logger.info("Created TUDELFT tudelft_downloads_stats_tmp table");
+
+ // Dropping TUDELFT tudelft_result_views_monthly_tmp view
+ logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view");
+ sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp";
+ logger.info("Dropped tudelft_result_views_monthly_tmp view ");
+ stmt.executeUpdate(sql);
+
+ // Dropping TUDELFT tudelft_result_views_monthly_tmp view
+ logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view");
+ sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp";
+ logger.info("Dropped tudelft_result_downloads_monthly_tmp view ");
+ stmt.executeUpdate(sql);
+
}
public void finalizeStats() throws Exception {
stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
- //Dropping views_stats table
- logger.info("Dropping views_stats table");
- String sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".views_stats";
+ // Dropping views_stats table
+ logger.info("Dropping views_stats table");
+ String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
logger.info("Dropped views_stats table ");
stmt.executeUpdate(sql);
- //Dropping downloads_stats table
- logger.info("Dropping downloads_stats table");
- sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
+ // Dropping downloads_stats table
+ logger.info("Dropping downloads_stats table");
+ sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
logger.info("Dropped downloads_stats table ");
stmt.executeUpdate(sql);
- //Dropping page_views_stats table
- logger.info("Dropping pageviews_stats table");
- sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
+ // Dropping page_views_stats table
+ logger.info("Dropping pageviews_stats table");
+ sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
logger.info("Dropped pageviews_stats table ");
stmt.executeUpdate(sql);
- //Creating views_stats table
+ // Dropping usage_stats table
+ logger.info("Dropping usage_stats table");
+ sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
+ logger.info("Dropped usage_stats table ");
+ stmt.executeUpdate(sql);
+
+ // Creating views_stats table
logger.info("Creating views_stats table");
- String createViewsStats = "CREATE TABLE IF NOT EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".views_stats " +
- "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET";
+ String createViewsStats = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".views_stats "
+ + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(createViewsStats);
logger.info("Created views_stats table");
-
- //Inserting OpenAIRE views stats
- logger.info("Inserting Openaire data to views_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp";
- stmt.executeUpdate(sql);
- logger.info("Openaire views updated to views_stats");
- //Inserting Lareferencia views stats
- logger.info("Inserting LaReferencia data to views_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp";
+ // Inserting OpenAIRE views stats
+ logger.info("Inserting Openaire data to views_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp";
stmt.executeUpdate(sql);
- logger.info("LaReferencia views updated to views_stats");
-
+ logger.info("Openaire views updated to views_stats");
+
+ // Inserting Pedocs old views stats
+ logger.info("Inserting Pedocs old data to views_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("Pedocs views updated to views_stats");
+
+ // Inserting TUDELFT views stats
+ logger.info("Inserting TUDELFT data to views_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("TUDELFT views updated to views_stats");
+
+ // Inserting Lareferencia views stats
+ logger.info("Inserting LaReferencia data to views_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("LaReferencia views updated to views_stats");
logger.info("Creating downloads_stats table");
- String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".downloads_stats " +
- "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET";
+ String createDownloadsStats = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".downloads_stats "
+ + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(createDownloadsStats);
logger.info("Created downloads_stats table");
- //Inserting OpenAIRE downloads stats
+ // Inserting OpenAIRE downloads stats
logger.info("Inserting OpenAIRE data to downloads_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp";
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp";
stmt.executeUpdate(sql);
+ logger.info("Inserted OpenAIRE data to downloads_stats");
- //Inserting Lareferencia downloads stats
+ // Inserting Pedocs old downloads stats
+ logger.info("Inserting PeDocs old data to downloads_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("Inserted Pedocs data to downloads_stats");
+
+ // Inserting TUDELFT downloads stats
+ logger.info("Inserting TUDELFT old data to downloads_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("Inserted TUDELFT data to downloads_stats");
+
+ // Inserting Lareferencia downloads stats
logger.info("Inserting LaReferencia data to downloads_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
stmt.executeUpdate(sql);
- logger.info("Lareferencia downloads updated to downloads_stats");
-
- //Inserting IRUS downloads stats
- logger.info("Inserting IRUS data to downloads_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp";
- stmt.executeUpdate(sql);
- logger.info("IRUS downloads updated to downloads_stats");
+ logger.info("Lareferencia downloads updated to downloads_stats");
- //Inserting SARC-OJS downloads stats
- logger.info("Inserting SARC data to downloads_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp";
+ // Inserting IRUS downloads stats
+ logger.info("Inserting IRUS data to downloads_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp";
stmt.executeUpdate(sql);
- logger.info("SARC-OJS downloads updated to downloads_stats");
-
-
- logger.info("Creating pageviews_stats table");
+ logger.info("IRUS downloads updated to downloads_stats");
+
+ // Inserting SARC-OJS downloads stats
+ logger.info("Inserting SARC data to downloads_stats");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("SARC-OJS downloads updated to downloads_stats");
+
+ logger.info("Creating pageviews_stats table");
String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".pageviews_stats " +
- "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET";
+ + ".pageviews_stats "
+ + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(create_pageviews_stats);
logger.info("Created pageviews_stats table");
-
- //Inserting OpenAIRE views stats from Portal
+
+ // Inserting OpenAIRE views stats from Portal
logger.info("Inserting data to page_views_stats");
- sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " +
- "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp";
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp";
stmt.executeUpdate(sql);
-
+
logger.info("Dropping full_dates table");
- String dropFullDates = "DROP TABLE IF EXISTS " +
- ConnectDB.getUsageStatsDBSchema() +
- ".full_dates";
+ String dropFullDates = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".full_dates";
stmt.executeUpdate(dropFullDates);
logger.info("Dropped full_dates table");
@@ -310,35 +479,80 @@ public class PiwikStatsDB {
int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
logger.info("Creating full_dates table");
- sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " +
- "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " +
- "FROM (SELECT DATE '2016-01-01' AS from_date) p " +
- "LATERAL VIEW " +
- "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x";
+ sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS "
+ + "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date "
+ + "FROM (SELECT DATE '2016-01-01' AS from_date) p "
+ + "LATERAL VIEW "
+ + "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x";
stmt.executeUpdate(sql);
logger.info("Created full_dates table");
-
- logger.info("Inserting data to usage_stats");
- sql = "CREATE TABLE IF NOT EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " +
- "SELECT coalesce(ds.source, vs.source) as source, " +
- "coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
- "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
- "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
- "coalesce(ds.openaire, 0) as openaire_downloads, " +
- "coalesce(vs.openaire, 0) as openaire_views " +
- "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " +
- ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " +
- "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
- stmt.executeUpdate(sql);
- logger.info("Inserted data to usage_stats");
+ logger.info("Inserting data to usage_stats");
+ sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS "
+ + "SELECT coalesce(ds.source, vs.source) as source, "
+ + "coalesce(ds.repository_id, vs.repository_id) as repository_id, "
+ + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, "
+ + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, "
+ + "coalesce(ds.openaire, 0) as openaire_downloads, "
+ + "coalesce(vs.openaire, 0) as openaire_views "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN "
+ + ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source "
+ + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
+ stmt.executeUpdate(sql);
+ logger.info("Inserted data to usage_stats");
+ logger.info("Building views at permanent DB starts at: " + new Timestamp(System.currentTimeMillis()));
+
+ logger.info("Dropping view views_stats on permanent usagestats DB");
+ sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped view views_stats on permanent usagestats DB");
+
+ logger.info("Create view views_stats on permanent usagestats DB");
+ sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"
+ + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
+ stmt.executeUpdate(sql);
+ logger.info("Created view views_stats on permanent usagestats DB");
+
+ logger.info("Dropping view pageviews_stats on permanent usagestats DB");
+ sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped view pageviews_stats on permanent usagestats DB");
+
+ logger.info("Create view pageviews_stats on permanent usagestats DB");
+ sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"
+ + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
+ stmt.executeUpdate(sql);
+ logger.info("Created view pageviews_stats on permanent usagestats DB");
+
+ logger.info("Dropping view downloads_stats on permanent usagestats DB");
+ sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped view on downloads_stats on permanent usagestats DB");
+
+ logger.info("Create view on downloads_stats on permanent usagestats DB");
+ sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"
+ + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
+ stmt.executeUpdate(sql);
+ logger.info("Created view on downloads_stats on permanent usagestats DB");
+
+ logger.info("Dropping view usage_stats on permanent usagestats DB");
+ sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped view on usage_stats on permanent usagestats DB");
+
+ logger.info("Create view on usage_stats on permanent usagestats DB");
+ sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"
+ + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
+ stmt.executeUpdate(sql);
+ logger.info("Created view on usage_stats on permanent usagestats DB");
+
+ logger.info("Building views at permanent DB ends at: " + new Timestamp(System.currentTimeMillis()));
stmt.close();
ConnectDB.getHiveConnection().close();
}
-
private Connection getConnection() throws SQLException {
return ConnectDB.getHiveConnection();
}
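
For readers unfamiliar with the Hive idiom used above to build full_dates, the following standalone sketch (not part of the patch; the class name, schema literal and dates are illustrative assumptions) shows how the Calendar-based month count feeds posexplode(split(space(n), ' ')), which emits one row per month offset:

    import java.util.Calendar;

    public class FullDatesSketch {
        public static void main(String[] args) {
            Calendar startCalendar = Calendar.getInstance();
            startCalendar.set(2016, Calendar.JANUARY, 1);
            Calendar endCalendar = Calendar.getInstance(); // "now"

            // Same arithmetic as the patch: whole years times 12 plus the month difference.
            int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
            int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);

            // space(n) yields a string of n blanks; split(..., ' ') gives n+1 empty tokens;
            // posexplode emits their positions 0..n, i.e. one row per month offset,
            // which add_months() then turns into a 'yyyy/MM' value per month.
            String sql = "CREATE TABLE IF NOT EXISTS usagestats.full_dates AS "
                + "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date "
                + "FROM (SELECT DATE '2016-01-01' AS from_date) p "
                + "LATERAL VIEW posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x";
            System.out.println(sql);
        }
    }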
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java
index 2d224075f..880233f00 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.*;
@@ -33,74 +34,74 @@ import org.slf4j.LoggerFactory;
*/
public class SarcStats {
- private Statement stmtHive = null;
- private Statement stmtImpala = null;
+ private Statement stmtHive = null;
+ private Statement stmtImpala = null;
- private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
+ private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
- public SarcStats() throws Exception {
+ public SarcStats() throws Exception {
// createTables();
- }
+ }
- private void createTables() throws Exception {
- try {
+ private void createTables() throws Exception {
+ try {
- stmtHive = ConnectDB.getHiveConnection().createStatement();
- String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
- stmtHive.executeUpdate(sqlCreateTableSushiLog);
+ stmtHive = ConnectDB.getHiveConnection().createStatement();
+ String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
+ stmtHive.executeUpdate(sqlCreateTableSushiLog);
- // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
- // stmt.executeUpdate(sqlCopyPublicSushiLog);
- String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
- + " ON INSERT TO sushilog "
- + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
- + "sushilog.rid, sushilog.date "
- + "FROM sushilog "
- + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
- stmtHive.executeUpdate(sqlcreateRuleSushiLog);
- String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
- stmtHive.executeUpdate(createSushiIndex);
+ // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
+ // stmt.executeUpdate(sqlCopyPublicSushiLog);
+ String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ + " ON INSERT TO sushilog "
+ + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
+ + "sushilog.rid, sushilog.date "
+ + "FROM sushilog "
+ + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
+ stmtHive.executeUpdate(sqlcreateRuleSushiLog);
+ String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
+ stmtHive.executeUpdate(createSushiIndex);
- stmtHive.close();
- ConnectDB.getHiveConnection().close();
- logger.info("Sushi Tables Created");
- } catch (Exception e) {
- logger.error("Failed to create tables: " + e);
- throw new Exception("Failed to create tables: " + e.toString(), e);
- }
- }
+ stmtHive.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Sushi Tables Created");
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
- public void processSarc() throws Exception {
- Statement stmt = ConnectDB.getHiveConnection().createStatement();
- ConnectDB.getHiveConnection().setAutoCommit(false);
+ public void processSarc() throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
- logger.info("Creating sarc_downloads_stats_tmp table");
- String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
- + ".sarc_downloads_stats_tmp "
- + "(`source` string, "
- + "`repository_id` string, "
- + "`result_id` string, "
- + "`date` string, "
- + "`count` bigint, "
- + "`openaire` bigint)";
- stmt.executeUpdate(createDownloadsStats);
- logger.info("Created sarc_downloads_stats_tmp table");
+ logger.info("Creating sarc_downloads_stats_tmp table");
+ String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sarc_downloads_stats_tmp "
+ + "(`source` string, "
+ + "`repository_id` string, "
+ + "`result_id` string, "
+ + "`date` string, "
+ + "`count` bigint, "
+ + "`openaire` bigint)";
+ stmt.executeUpdate(createDownloadsStats);
+ logger.info("Created sarc_downloads_stats_tmp table");
- logger.info("Inserting into sarc_downloads_stats_tmp");
- String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp "
- + "SELECT s.source, d.id AS repository_id, "
- + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
- + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
- + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
- + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
- + ConnectDB.getStatsDBSchema() + ".result_pids ro "
- + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
- + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
- stmt.executeUpdate(insertSarcStats);
- logger.info("Inserted into sarc_downloads_stats_tmp");
-
- stmt.close();
- //ConnectDB.getHiveConnection().close();
- }
+ logger.info("Inserting into sarc_downloads_stats_tmp");
+ String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp "
+ + "SELECT s.source, d.id AS repository_id, "
+ + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
+ + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
+ + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
+ + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+ + ConnectDB.getStatsDBSchema() + ".result_pids ro "
+ + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
+ + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
+ stmt.executeUpdate(insertSarcStats);
+ logger.info("Inserted into sarc_downloads_stats_tmp");
+
+ stmt.close();
+ // ConnectDB.getHiveConnection().close();
+ }
}
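
The CONCAT/LPAD expression in insertSarcStats produces the same 'yyyy/MM' period key used throughout the stats tables. A minimal sketch of the equivalent formatting in plain Java (illustrative only, not part of the patch):

    import java.time.LocalDate;
    import java.time.format.DateTimeFormatter;

    public class PeriodKeySketch {
        public static void main(String[] args) {
            LocalDate date = LocalDate.of(2020, 3, 7);
            // LPAD(CAST(MONTH(`date`) AS STRING), 2, '0') corresponds to a zero-padded month,
            // so both sides of the join produce keys like "2020/03".
            String periodKey = date.format(DateTimeFormatter.ofPattern("yyyy/MM"));
            System.out.println(periodKey); // prints 2020/03
        }
    }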
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java
index 43abb1681..47986f52a 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.IOException;
@@ -17,90 +18,110 @@ import org.slf4j.LoggerFactory;
*/
public class UsageStatsExporter {
- public UsageStatsExporter() {
+ public UsageStatsExporter() {
- }
+ }
- private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
+ private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
- public void export() throws Exception {
+ public void export() throws Exception {
- logger.info("Initialising DB properties");
- ConnectDB.init();
+ logger.info("Initialising DB properties");
+ ConnectDB.init();
// runImpalaQuery();
- PiwikStatsDB piwikstatsdb = new PiwikStatsDB();
+ PiwikStatsDB piwikstatsdb = new PiwikStatsDB();
- logger.info("Re-creating database and tables");
- if (ExecuteWorkflow.recreateDbAndTables) {
- piwikstatsdb.recreateDBAndTables();
- logger.info("DB-Tables are created ");
- }
+ logger.info("Re-creating database and tables");
+ if (ExecuteWorkflow.recreateDbAndTables) {
+ piwikstatsdb.recreateDBAndTables();
+ logger.info("DB-Tables are created ");
+ }
// else {
// piwikstatsdb.createTmpTables();
// logger.info("TmpTables are created ");
// }
- if (ExecuteWorkflow.processPiwikLogs) {
- logger.info("Processing logs");
- piwikstatsdb.processLogs();
- }
+ if (ExecuteWorkflow.processPiwikLogs) {
+ logger.info("Processing Piwik logs");
+ piwikstatsdb.processLogs();
+ logger.info("Piwik logs Done");
+ logger.info("Processing Pedocs Old Stats");
+ piwikstatsdb.uploadOldPedocs();
+ logger.info("Processing Pedocs Old Stats Done");
+ logger.info("Processing TUDELFT Stats");
+ piwikstatsdb.uploadTUDELFTStats();
+ logger.info("Processing TUDELFT Stats Done");
- LaReferenciaStats lastats = new LaReferenciaStats();
+ }
- if (ExecuteWorkflow.processLaReferenciaLogs) {
- logger.info("Processing LaReferencia logs");
- lastats.processLogs();
- logger.info("LaReferencia logs done");
- }
-
- IrusStats irusstats = new IrusStats();
-
- if (ExecuteWorkflow.irusProcessStats) {
- logger.info("Processing IRUS");
- irusstats.processIrusStats();
- logger.info("Irus done");
- }
+ LaReferenciaStats lastats = new LaReferenciaStats();
- SarcStats sarcStats = new SarcStats();
+ if (ExecuteWorkflow.processLaReferenciaLogs) {
+ logger.info("Processing LaReferencia logs");
+ lastats.processLogs();
+ logger.info("LaReferencia logs done");
+ }
- if (ExecuteWorkflow.sarcProcessStats) {
- sarcStats.processSarc();
- }
- logger.info("Sarc done");
+ IrusStats irusstats = new IrusStats();
- // finalize usagestats
- if (ExecuteWorkflow.finalizeStats) {
- piwikstatsdb.finalizeStats();
- logger.info("Finalized stats");
- }
+ if (ExecuteWorkflow.irusProcessStats) {
+ logger.info("Processing IRUS");
+ irusstats.processIrusStats();
+ logger.info("Irus done");
+ }
- // Make the tables available to Impala
- if (ExecuteWorkflow.finalTablesVisibleToImpala) {
- logger.info("Making tables visible to Impala");
- invalidateMetadata();
- }
+ SarcStats sarcStats = new SarcStats();
- logger.info("End");
- }
+ if (ExecuteWorkflow.sarcProcessStats) {
+ sarcStats.processSarc();
+ }
+ logger.info("Sarc done");
- private void invalidateMetadata() throws SQLException {
- Statement stmt = null;
+ // finalize usagestats
+ if (ExecuteWorkflow.finalizeStats) {
+ piwikstatsdb.finalizeStats();
+ logger.info("Finalized stats");
+ }
- stmt = ConnectDB.getImpalaConnection().createStatement();
+ // Make the tables available to Impala
+ if (ExecuteWorkflow.finalTablesVisibleToImpala) {
+ logger.info("Making tables visible to Impala");
+ invalidateMetadata();
+ }
- String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
- stmt.executeUpdate(sql);
+ logger.info("End");
+ }
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
- stmt.executeUpdate(sql);
+ private void invalidateMetadata() throws SQLException {
+ Statement stmt = null;
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
- stmt.executeUpdate(sql);
+ stmt = ConnectDB.getImpalaConnection().createStatement();
- sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
- stmt.executeUpdate(sql);
+ String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
+ stmt.executeUpdate(sql);
- stmt.close();
- ConnectDB.getHiveConnection().close();
- }
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats";
+ stmt.executeUpdate(sql);
+
+ sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats";
+ stmt.executeUpdate(sql);
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ }
}
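
The invalidateMetadata() additions above repeat the same statement once per table. A hedged sketch of the identical effect expressed as a loop (illustrative only; it assumes the module's ConnectDB helper shown in this diff, while the class and method names below are invented):

    package eu.dnetlib.oa.graph.usagestatsbuild.export;

    import java.sql.SQLException;
    import java.sql.Statement;

    public class InvalidateMetadataSketch {
        static void invalidateAll() throws SQLException {
            String[] tables = { "downloads_stats", "views_stats", "usage_stats", "pageviews_stats" };
            Statement stmt = ConnectDB.getImpalaConnection().createStatement();
            try {
                for (String table : tables) {
                    // Impala caches Hive metadata; INVALIDATE METADATA forces a reload so tables
                    // and views created through Hive become queryable from Impala.
                    stmt.executeUpdate("INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + "." + table);
                    stmt.executeUpdate("INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + "." + table);
                }
            } finally {
                stmt.close();
            }
        }
    }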
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json
index 3f121288e..407370ada 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json
@@ -1,237 +1,128 @@
[
- {
- "paramName": "mat",
- "paramLongName": "matomoAuthToken",
- "paramDescription": "when true will stop SparkSession after job execution",
- "paramRequired": false
- },
- {
- "paramName": "mbu",
- "paramLongName": "matomoBaseURL",
- "paramDescription": "URL of the isLookUp Service",
- "paramRequired": true
- },
- {
- "paramName": "rlp",
- "paramLongName": "repoLogPath",
- "paramDescription": "nameNode of the source cluster",
- "paramRequired": true
- },
- {
- "paramName": "plp",
- "paramLongName": "portalLogPath",
- "paramDescription": "namoNode of the target cluster",
- "paramRequired": true
- },
- {
- "paramName": "pmi",
- "paramLongName": "portalMatomoID",
- "paramDescription": "namoNode of the target cluster",
- "paramRequired": true
- },
- {
- "paramName": "iukbuw",
- "paramLongName": "irusUKBaseURL",
- "paramDescription": "working directory",
- "paramRequired": true
- },
- {
- "paramName": "iukrp",
- "paramLongName": "irusUKReportPath",
- "paramDescription": "maximum number of map tasks used in the distcp process",
- "paramRequired": true
- },
- {
- "paramName": "srpa",
- "paramLongName": "sarcsReportPathArray",
- "paramDescription": "memory for distcp action copying actionsets from remote cluster",
- "paramRequired": true
- },
- {
- "paramName": "srpna",
- "paramLongName": "sarcsReportPathNonArray",
- "paramDescription": "timeout for distcp copying actions from remote cluster",
- "paramRequired": true
- },
- {
- "paramName": "llp",
- "paramLongName": "lareferenciaLogPath",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "lbu",
- "paramLongName": "lareferenciaBaseURL",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "lat",
- "paramLongName": "lareferenciaAuthToken",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "dbhu",
- "paramLongName": "dbHiveUrl",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "dbiu",
- "paramLongName": "dbImpalaUrl",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "urdbs",
- "paramLongName": "usageRawDataDBSchema",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
{
- "paramName": "usdbs",
- "paramLongName": "usageStatsDBSchema",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "sdbs",
- "paramLongName": "statsDBSchema",
- "paramDescription": "activate tranform-only mode. Only apply transformation step",
- "paramRequired": true
- },
- {
- "paramName": "rdbt",
- "paramLongName": "recreateDbAndTables",
- "paramDescription": "Re-create database and initial tables?",
- "paramRequired": true
- },
- {
- "paramName": "pwed",
- "paramLongName": "piwikEmptyDirs",
- "paramDescription": "Empty piwik directories?",
- "paramRequired": true
- },
- {
- "paramName": "ppwl",
- "paramLongName": "processPiwikLogs",
- "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
- "paramRequired": true
- },
- {
- "paramName": "dpwl",
- "paramLongName": "downloadPiwikLogs",
- "paramDescription": "download piwik logs?",
- "paramRequired": true
- },
- {
- "paramName": "slp",
- "paramLongName": "startingLogPeriod",
- "paramDescription": "Starting log period",
- "paramRequired": true
- },
- {
- "paramName": "elp",
- "paramLongName": "endingLogPeriod",
- "paramDescription": "Ending log period",
- "paramRequired": true
- },
- {
- "paramName": "npidd",
- "paramLongName": "numberOfPiwikIdsToDownload",
- "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload",
- "paramRequired": true
- },
- {
- "paramName": "nsidd",
- "paramLongName": "numberOfSiteIdsToDownload",
- "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload",
- "paramRequired": true
- },
- {
- "paramName": "lerd",
- "paramLongName": "laReferenciaEmptyDirs",
- "paramDescription": "Empty LaReferencia directories?",
- "paramRequired": true
- },
- {
- "paramName": "plrl",
- "paramLongName": "processLaReferenciaLogs",
- "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data",
- "paramRequired": true
- },
- {
- "paramName": "dlrl",
- "paramLongName": "downloadLaReferenciaLogs",
- "paramDescription": "download La Referencia logs?",
- "paramRequired": true
- },
- {
- "paramName": "icted",
- "paramLongName": "irusCreateTablesEmptyDirs",
- "paramDescription": "Irus section: Create tables and empty JSON directories?",
- "paramRequired": true
- },
- {
- "paramName": "idr",
- "paramLongName": "irusDownloadReports",
- "paramDescription": "Irus section: Download reports?",
- "paramRequired": true
- },
- {
- "paramName": "ipr",
- "paramLongName": "irusProcessStats",
- "paramDescription": "Irus section: Process stats?",
- "paramRequired": true
- },
- {
- "paramName": "inod",
- "paramLongName": "irusNumberOfOpendoarsToDownload",
- "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload",
- "paramRequired": true
- },
- {
- "paramName": "icted",
- "paramLongName": "sarcCreateTablesEmptyDirs",
- "paramDescription": "Sarc section: Create tables and empty JSON directories?",
- "paramRequired": true
- },
- {
- "paramName": "idr",
- "paramLongName": "sarcDownloadReports",
- "paramDescription": "Sarc section: Download reports?",
- "paramRequired": true
- },
- {
- "paramName": "ipr",
- "paramLongName": "sarcProcessStats",
- "paramDescription": "Sarc section: Process stats?",
- "paramRequired": true
- },
- {
- "paramName": "inod",
- "paramLongName": "sarcNumberOfIssnToDownload",
- "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
- "paramRequired": true
- },
-
- {
- "paramName": "fs",
- "paramLongName": "finalizeStats",
- "paramDescription": "Create the usage_stats table?",
- "paramRequired": true
- },
- {
- "paramName": "ftvi",
- "paramLongName": "finalTablesVisibleToImpala",
- "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
- "paramRequired": true
- },
- {
- "paramName": "nodt",
- "paramLongName": "numberOfDownloadThreads",
- "paramDescription": "Number of download threads",
- "paramRequired": true
- }
+ "paramName": "rlp",
+ "paramLongName": "repoLogPath",
+ "paramDescription": "nameNode of the source cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "plp",
+ "paramLongName": "portalLogPath",
+ "paramDescription": "namoNode of the target cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pmi",
+ "paramLongName": "portalMatomoID",
+ "paramDescription": "namoNode of the target cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "iukrp",
+ "paramLongName": "irusUKReportPath",
+ "paramDescription": "maximum number of map tasks used in the distcp process",
+ "paramRequired": true
+ },
+ {
+ "paramName": "srpa",
+ "paramLongName": "sarcsReportPathArray",
+ "paramDescription": "memory for distcp action copying actionsets from remote cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "srpna",
+ "paramLongName": "sarcsReportPathNonArray",
+ "paramDescription": "timeout for distcp copying actions from remote cluster",
+ "paramRequired": true
+ },
+ {
+ "paramName": "llp",
+ "paramLongName": "lareferenciaLogPath",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dbhu",
+ "paramLongName": "dbHiveUrl",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dbiu",
+ "paramLongName": "dbImpalaUrl",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "urdbs",
+ "paramLongName": "usageRawDataDBSchema",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "usdbs",
+ "paramLongName": "usageStatsDBSchema",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "sdbs",
+ "paramLongName": "statsDBSchema",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "uspdbs",
+ "paramLongName": "usagestatsPermanentDBSchema",
+ "paramDescription": "activate tranform-only mode. Only apply transformation step",
+ "paramRequired": true
+ },
+ {
+ "paramName": "rdbt",
+ "paramLongName": "recreateDbAndTables",
+ "paramDescription": "Re-create database and initial tables?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ppwl",
+ "paramLongName": "processPiwikLogs",
+ "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
+ "paramRequired": true
+ },
+ {
+ "paramName": "plrl",
+ "paramLongName": "processLaReferenciaLogs",
+ "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ipr",
+ "paramLongName": "irusProcessStats",
+ "paramDescription": "Irus section: Process stats?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ipr",
+ "paramLongName": "sarcProcessStats",
+ "paramDescription": "Sarc section: Process stats?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "fs",
+ "paramLongName": "finalizeStats",
+ "paramDescription": "Create the usage_stats table?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ftvi",
+ "paramLongName": "finalTablesVisibleToImpala",
+ "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nodt",
+ "paramLongName": "numberOfDownloadThreads",
+ "paramDescription": "Number of download threads",
+ "paramRequired": true
+ }
]
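
Each entry in this file maps a --paramLongName command-line argument to a value, with paramRequired marking mandatory ones. A hypothetical sketch of how such argument pairs are typically consumed (the parsing code below is invented for illustration and is not the project's actual argument parser):

    import java.util.HashMap;
    import java.util.Map;

    public class ArgsSketch {
        public static void main(String[] args) {
            // e.g. args = { "--processPiwikLogs", "true", "--usageStatsDBSchema", "usagestats" }
            Map<String, String> params = new HashMap<>();
            for (int i = 0; i + 1 < args.length; i += 2) {
                if (args[i].startsWith("--")) {
                    params.put(args[i].substring(2), args[i + 1]);
                }
            }
            boolean processPiwikLogs = Boolean.parseBoolean(params.getOrDefault("processPiwikLogs", "false"));
            String usageStatsDBSchema = params.get("usageStatsDBSchema"); // required per the JSON above
            System.out.println(processPiwikLogs + " " + usageStatsDBSchema);
        }
    }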
diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml
index 37700539b..71e8a50d6 100644
--- a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml
@@ -42,42 +42,24 @@
<main-class>eu.dnetlib.oa.graph.usagestatsbuild.export.ExecuteWorkflow</main-class>
- <arg>--matomoAuthToken</arg><arg>${matomoAuthToken}</arg>
- <arg>--matomoBaseURL</arg><arg>${matomoBaseURL}</arg>
<arg>--repoLogPath</arg><arg>${repoLogPath}</arg>
<arg>--portalLogPath</arg><arg>${portalLogPath}</arg>
<arg>--portalMatomoID</arg><arg>${portalMatomoID}</arg>
- <arg>--irusUKBaseURL</arg><arg>${irusUKBaseURL}</arg>
<arg>--irusUKReportPath</arg><arg>${irusUKReportPath}</arg>
<arg>--sarcsReportPathArray</arg><arg>${sarcsReportPathArray}</arg>
<arg>--sarcsReportPathNonArray</arg><arg>${sarcsReportPathNonArray}</arg>
<arg>--lareferenciaLogPath</arg><arg>${lareferenciaLogPath}</arg>
- <arg>--lareferenciaBaseURL</arg><arg>${lareferenciaBaseURL}</arg>
- <arg>--lareferenciaAuthToken</arg><arg>${lareferenciaAuthToken}</arg>
<arg>--dbHiveUrl</arg><arg>${hiveJdbcUrl}</arg>
<arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
<arg>--usageRawDataDBSchema</arg><arg>${usageRawDataDBSchema}</arg>
<arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
+ <arg>--usagestatsPermanentDBSchema</arg><arg>${usagestatsPermanentDBSchema}</arg>
<arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
<arg>--recreateDbAndTables</arg><arg>${recreateDbAndTables}</arg>
- <arg>--piwikEmptyDirs</arg><arg>${piwikEmptyDirs}</arg>
- <arg>--downloadPiwikLogs</arg><arg>${downloadPiwikLogs}</arg>
<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
- <arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
- <arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
- <arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
- <arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
- <arg>--laReferenciaEmptyDirs</arg><arg>${laReferenciaEmptyDirs}</arg>
- <arg>--downloadLaReferenciaLogs</arg><arg>${downloadLaReferenciaLogs}</arg>
<arg>--processLaReferenciaLogs</arg><arg>${processLaReferenciaLogs}</arg>
- <arg>--irusCreateTablesEmptyDirs</arg><arg>${irusCreateTablesEmptyDirs}</arg>
- <arg>--irusDownloadReports</arg><arg>${irusDownloadReports}</arg>
<arg>--irusProcessStats</arg><arg>${irusProcessStats}</arg>
- <arg>--irusNumberOfOpendoarsToDownload</arg><arg>${irusNumberOfOpendoarsToDownload}</arg>
- <arg>--sarcCreateTablesEmptyDirs</arg><arg>${sarcCreateTablesEmptyDirs}</arg>
- <arg>--sarcDownloadReports</arg><arg>${sarcDownloadReports}</arg>
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
- <arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>