Commit 12122020

This commit is contained in:
Dimitris 2020-12-12 12:00:14 +02:00
parent bbcf6b7c8b
commit dc9c2f3272
33 changed files with 3306 additions and 3022 deletions

View File

@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<project-shared-configuration>
<!--
This file contains additional configuration written by modules in the NetBeans IDE.
The configuration is intended to be shared among all the users of the project and
therefore it is assumed to be part of version control checkout.
Without this configuration present, some functionality in the IDE may be limited or fail altogether.
-->
<properties xmlns="http://www.netbeans.org/ns/maven-properties-data/1">
<!--
Properties that influence various parts of the IDE, especially code formatting and the like.
You can copy and paste individual properties into the pom.xml file, and the IDE will pick them up.
That way multiple projects can share the same settings (useful for formatting rules for example).
Any value defined here will override the pom.xml file value but is only applicable to the current project.
-->
<netbeans.hint.jdkPlatform>JDK_1.8</netbeans.hint.jdkPlatform>
</properties>
</project-shared-configuration>

View File

@ -23,7 +23,35 @@
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-usage-datasets-stats-update</artifactId>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.15</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<!-- more config here as you see fit -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
@ -68,6 +96,11 @@
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.mchange</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.5.2</version>
</dependency>
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>

View File

@ -0,0 +1 @@
# Cleans and packages the module with the oozie-package, deploy and run Maven profiles enabled,
# passing the datasets usage-stats workflow directory as the workflow.source.dir property.
mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats

View File

@ -79,6 +79,7 @@ public abstract class ConnectDB {
*/
ComboPooledDataSource cpds = new ComboPooledDataSource();
cpds.setJdbcUrl(dbHiveUrl);
cpds.setUser("dimitris.pierrakos");
cpds.setAcquireIncrement(1);
cpds.setMaxPoolSize(100);
cpds.setMinPoolSize(1);
@ -107,6 +108,7 @@ public abstract class ConnectDB {
*/
ComboPooledDataSource cpds = new ComboPooledDataSource();
cpds.setJdbcUrl(dbImpalaUrl);
cpds.setUser("dimitris.pierrakos");
cpds.setAcquireIncrement(1);
cpds.setMaxPoolSize(100);
cpds.setMinPoolSize(1);
@ -126,77 +128,4 @@ public abstract class ConnectDB {
return cpds.getConnection();
}
/**
 * Drops the datasets usage-stats schema (with CASCADE) and creates it again.
 * Each statement is closed as soon as it has run, whether or not it succeeded.
 *
 * @throws Exception wrapping the original failure when either the drop or the create fails
 */
private void createDatabase() throws Exception {
    try (java.sql.Statement dropStatement = ConnectDB.getHiveConnection().createStatement()) {
        logger.info("Dropping logs DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
        String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
        dropStatement.executeUpdate(dropDatabase);
    } catch (Exception e) {
        logger.error("Failed to drop database: " + e);
        throw new Exception("Failed to drop database: " + e.toString(), e);
    }

    try (java.sql.Statement createStatement = ConnectDB.getHiveConnection().createStatement()) {
        logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
        String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
        createStatement.executeUpdate(createDatabase);
    } catch (Exception e) {
        logger.error("Failed to create database: " + e);
        throw new Exception("Failed to create database: " + e.toString(), e);
    }
}
/**
 * Creates the {@code piwiklog} and {@code process_portal_log} tables if they do not exist yet,
 * then closes the Hive connection.
 * The statement is released even when one of the CREATE statements fails.
 *
 * @throws Exception wrapping the original failure when a table cannot be created
 */
private void createTables() throws Exception {
    try {
        try (java.sql.Statement tableStatement = ConnectDB.getHiveConnection().createStatement()) {
            // Create Piwiklog table - This table should exist
            String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
                + ConnectDB.getDataSetUsageStatsDBSchema()
                + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
                + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
                + "clustered by (source, id_visit, action, timestamp, entity_id) "
                + "into 100 buckets stored as orc tblproperties('transactional'='true')";
            tableStatement.executeUpdate(sqlCreateTablePiwikLog);

            /////////////////////////////////////////
            // Rule for duplicate inserts @ piwiklog
            /////////////////////////////////////////

            String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
                + ConnectDB.getDataSetUsageStatsDBSchema()
                + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
                + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
                + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
            tableStatement.executeUpdate(sqlCreateTablePortalLog);

            //////////////////////////////////////////////////
            // Rule for duplicate inserts @ process_portal_log
            //////////////////////////////////////////////////
        }
        // As before, the connection is only closed once both tables were created successfully.
        ConnectDB.getHiveConnection().close();
    } catch (Exception e) {
        logger.error("Failed to create tables: " + e);
        throw new Exception("Failed to create tables: " + e.toString(), e);
    }
}
}
/*
CREATE TABLE IF NOT EXISTS dataciteReports (reportid STRING,
name STRING,
source STRING,
release STRING,
createdby STRING,
report_end_date STRING,
report_start_date STRING)
CLUSTERED BY (reportid)
into 100 buckets stored as orc tblproperties('transactional'='true');
*/

View File

@ -0,0 +1,168 @@
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Creates the Hive database and the tables ({@code datacitereports}, {@code datasets},
 * {@code datasetsperformance}) used to store dataset usage statistics.
 * <p>
 * Every JDBC statement is opened and closed locally, so a failing DDL statement no longer
 * leaves an open statement behind.
 *
 * @author D. Pierrakos, S. Zoupanos
 */
public class DatasetsStatsDB {

    private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class);

    private String logPath;
    private String logRepoPath;
    private String logPortalPath;
    private String CounterRobotsURL;
    private ArrayList robotsList;

    /**
     * @param logRepoPath   stored as-is; not read by any method of this class
     * @param logPortalPath stored as-is; not read by any method of this class
     */
    public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception {
        this.logRepoPath = logRepoPath;
        this.logPortalPath = logPortalPath;
    }

    /**
     * Drops and recreates the schema, then creates all tables inside it.
     *
     * @throws Exception if any DDL statement fails
     */
    public void recreateDBAndTables() throws Exception {
        this.createDatabase();
        this.createTables();
    }

// public void reCreateLogDirs() throws IllegalArgumentException, IOException {
// FileSystem dfs = FileSystem.get(new Configuration());
//
// logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
// dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
//
// logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
// dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
//
// logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
// dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
//
// logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
// dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
// }

    public ArrayList getRobotsList() {
        return robotsList;
    }

    public void setRobotsList(ArrayList robotsList) {
        this.robotsList = robotsList;
    }

    public String getCounterRobotsURL() {
        return CounterRobotsURL;
    }

    public void setCounterRobotsURL(String CounterRobotsURL) {
        this.CounterRobotsURL = CounterRobotsURL;
    }

    /**
     * Drops the schema (CASCADE) if it exists and creates it again.
     *
     * @throws Exception wrapping the original failure of the drop or the create step
     */
    private void createDatabase() throws Exception {
        try (Statement dropStatement = getConnection().createStatement()) {
            logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
            String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
            dropStatement.executeUpdate(dropDatabase);
        } catch (Exception e) {
            logger.error("Failed to drop database: " + e);
            throw new Exception("Failed to drop database: " + e.toString(), e);
        }

        try (Statement createStatement = getConnection().createStatement()) {
            logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
            String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
            createStatement.executeUpdate(createDatabase);
        } catch (Exception e) {
            logger.error("Failed to create database: " + e);
            throw new Exception("Failed to create database: " + e.toString(), e);
        }
    }

    /**
     * Creates the reports, datasets and datasets-performance tables if they are missing,
     * then closes the Hive connection.
     *
     * @throws Exception wrapping the original failure when a table cannot be created
     */
    private void createTables() throws Exception {
        try {
            try (Statement tableStatement = getConnection().createStatement()) {
                // Create Reports table - This table should exist
                logger.info("Creating Reports Table");
                String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS "
                    + ConnectDB.getDataSetUsageStatsDBSchema()
                    + ".datacitereports(reportid STRING, \n"
                    + " name STRING, \n"
                    + " source STRING,\n"
                    + " release STRING,\n"
                    + " createdby STRING,\n"
                    + " report_start_date STRING,\n"
                    + " report_end_date STRING)\n"
                    + " CLUSTERED BY (reportid)\n"
                    + " into 100 buckets stored as orc tblproperties('transactional'='true')";
                tableStatement.executeUpdate(sqlCreateTableDataciteReports);
                logger.info("Reports Table Created");

                // Create Datasets Table
                logger.info("Creating DataSets Table");
                String sqlCreateTableDataSets = "CREATE TABLE IF NOT EXISTS "
                    + ConnectDB.getDataSetUsageStatsDBSchema()
                    + ".datasets(ds_type STRING,\n"
                    + " ds_title STRING,\n"
                    + " yop STRING,\n"
                    + " uri STRING,\n"
                    + " platform STRING,\n"
                    + " data_type STRING,\n"
                    + " publisher STRING,\n"
                    + " publisher_id_type STRING,\n"
                    + " publisher_id_value STRING,\n"
                    + " ds_dates_type STRING,\n"
                    + " ds_pub_date STRING,\n"
                    + " ds_contributors STRING,\n"
                    // + " ds_contributor_value array <STRING>,\n"
                    + " reportid STRING)\n"
                    + " CLUSTERED BY (ds_type)\n"
                    + " into 100 buckets stored as orc tblproperties('transactional'='true')";
                tableStatement.executeUpdate(sqlCreateTableDataSets);
                logger.info("DataSets Table Created");

                // Create Datasets Performance Table
                logger.info("Creating DataSetsPerformance Table");
                String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS "
                    + ConnectDB.getDataSetUsageStatsDBSchema()
                    + ".datasetsperformance(ds_type STRING,\n"
                    + " period_end STRING,\n"
                    + " period_from STRING,\n"
                    + " access_method STRING,\n"
                    + " metric_type STRING,\n"
                    + " count INT,\n"
                    + " country_counts STRING,\n"
                    + " reportid STRING)\n"
                    + " CLUSTERED BY (ds_type)\n"
                    + " into 100 buckets stored as orc tblproperties('transactional'='true')";
                tableStatement.executeUpdate(sqlCreateTableDataSetsPerformance);
                logger.info("DataSetsPerformance Table Created");
            }
            // As before, the connection is only closed after all three tables were created.
            ConnectDB.getHiveConnection().close();
        } catch (Exception e) {
            logger.error("Failed to create tables: " + e);
            throw new Exception("Failed to create tables: " + e.toString(), e);
        }
    }

    /** Returns the Hive connection provided by {@link ConnectDB#getHiveConnection()}. */
    private Connection getConnection() throws SQLException {
        return ConnectDB.getHiveConnection();
    }
}

View File

@ -3,20 +3,18 @@
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import com.google.gson.JsonObject;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
@ -25,8 +23,12 @@ import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
/**
*
* @author dpie
*/
public class DownloadReportsListFromDatacite {
@ -35,7 +37,8 @@ public class DownloadReportsListFromDatacite {
private String dataciteReportPath;
// Logger named after this class so its log lines are attributed to the report downloader.
private static final Logger logger = LoggerFactory.getLogger(DownloadReportsListFromDatacite.class);
public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) throws MalformedURLException, Exception {
public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath)
throws MalformedURLException, Exception {
this.dataciteBaseURL = dataciteBaseURL;
this.dataciteReportPath = dataciteReportPath;
@ -80,7 +83,9 @@ public class DownloadReportsListFromDatacite {
responseStrBuilder2.append(inputStr);
}
FileSystem fs = FileSystem.get(new Configuration());
FSDataOutputStream fin = fs.create(new Path(dataciteReportPath + "/" + reportId + ".json"),
FSDataOutputStream fin = fs
.create(
new Path(dataciteReportPath + "/" + reportId + ".json"),
true);
byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes();
fin.write(jsonObjectRawBytes);

View File

@ -26,7 +26,6 @@ public class ExecuteWorkflow {
static boolean datasetsEmptyDirs;
static boolean finalTablesVisibleToImpala;
public static void main(String args[]) throws Exception {
// Sending the logs to the console
@ -58,11 +57,11 @@ public class ExecuteWorkflow {
else
datasetsEmptyDirs = false;
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
finalTablesVisibleToImpala = true;
else
finalTablesVisibleToImpala = false;
// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
// finalTablesVisibleToImpala = true;
// else
// finalTablesVisibleToImpala = false;
//
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
usagestatsExport.export();
}

View File

@ -0,0 +1,408 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import java.io.*;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.sql.Array;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Iterator;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
/**
* @author dpie
*/
public class ReadReportsListFromDatacite {
private String dataciteReportPath;
// Logger named after this class so its log lines are attributed to the report reader.
private static final Logger logger = LoggerFactory.getLogger(ReadReportsListFromDatacite.class);
/**
 * Creates a reader for the report files stored under the given path.
 *
 * @param dataciteReportPath directory whose files {@link #readReports()} lists and loads
 */
public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception {
this.dataciteReportPath = dataciteReportPath;
}
/**
 * Imports every report file found under {@code dataciteReportPath} into the usage-stats schema.
 * <p>
 * Each file is loaded into the temporary {@code tmpjson} table. A report whose id is already
 * present in {@code datacitereports} is skipped. For a new report the header row, one row per
 * dataset and one row per performance instance are inserted, and a gzip-compressed subset, when
 * present, is decoded and passed to {@link #readCompressedReport(String, String)}.
 * <p>
 * The number of datasets, performance periods and instances is taken from the parsed JSON
 * arrays, so exactly one row is inserted per existing element.
 *
 * @throws Exception if listing the directory, parsing a report or any Hive statement fails
 */
public void readReports() throws Exception {
    final String tmpTable = ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";

    try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
        ConnectDB.getHiveConnection().setAutoCommit(false);

        for (String jsonFile : listHdfsDir(dataciteReportPath)) {
            logger.info("Reading report file " + jsonFile);
            this.createTmpReportsTable(jsonFile);

            // If the table holds several rows the id of the last one wins, as it always did.
            String reportID = null;
            try (ResultSet ids = stmt
                .executeQuery("SELECT get_json_object(json, '$.report.id') FROM " + tmpTable)) {
                while (ids.next()) {
                    reportID = ids.getString(1);
                }
            }

            logger.info("Checking report with id " + reportID);
            // A null id can never match "reportid=?", so the lookup is skipped in that case.
            if (reportID != null && reportAlreadyStored(reportID)) {
                logger.info("Report found with ID " + reportID);
                dropTmpReportsTable();
                continue;
            }

            stmt.execute(insertReportSql());
            logger.info("Report added");

            logger.info("Adding datasets");
            JsonArray datasets = parseJsonArray(
                queryFirstString(
                    stmt, "SELECT get_json_object(json, '$.report.report-datasets') FROM " + tmpTable));
            logger.info("Datasets found " + datasets.size());

            for (int i = 0; i < datasets.size(); i++) {
                stmt.execute(insertDatasetSql(i));
                logger.info("Dataset added " + i);

                logger.info("Adding Dataset Performance");
                JsonArray performance = childArray(datasets.get(i), "performance");
                logger.info("Datasets Performance found " + performance.size());
                for (int j = 0; j < performance.size(); j++) {
                    JsonArray instances = childArray(performance.get(j), "instance");
                    for (int k = 0; k < instances.size(); k++) {
                        stmt.execute(insertPerformanceSql(i, j, k));
                    }
                }
                logger.info("DatasetPerformance added for dataset" + i);
            }

            logger.info("Adding gzip performance");
            String gzippedSubset = queryFirstString(
                stmt, "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM " + tmpTable);
            // Reports without a compressed subset yield NULL here; there is nothing to decode then.
            if (gzippedSubset != null) {
                this.readCompressedReport(uncompressString(gzippedSubset), reportID);
            }
        }
    }
    this.dropTmpReportsTable();
}

/** Returns the first column of the first row produced by {@code sql}, or null when there is no row. */
private static String queryFirstString(Statement stmt, String sql) throws SQLException {
    try (ResultSet rows = stmt.executeQuery(sql)) {
        return rows.next() ? rows.getString(1) : null;
    }
}

/** Tells whether {@code datacitereports} already contains a row for the given report id. */
private static boolean reportAlreadyStored(String reportID) throws SQLException {
    String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema()
        + ".datacitereports where reportid=?";
    try (PreparedStatement lookup = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists)) {
        lookup.setString(1, reportID);
        try (ResultSet match = lookup.executeQuery()) {
            return match.next();
        }
    }
}

/** Parses {@code json} as a JSON array; null, blank or non-array input gives an empty array. */
private static JsonArray parseJsonArray(String json) {
    if (json == null || json.trim().isEmpty()) {
        return new JsonArray();
    }
    JsonElement parsed = new Gson().fromJson(json, JsonElement.class);
    return parsed != null && parsed.isJsonArray() ? parsed.getAsJsonArray() : new JsonArray();
}

/** Returns the array stored under {@code name} in {@code parent}, or an empty array if there is none. */
private static JsonArray childArray(JsonElement parent, String name) {
    if (parent != null && parent.isJsonObject()) {
        JsonElement child = parent.getAsJsonObject().get(name);
        if (child != null && child.isJsonArray()) {
            return child.getAsJsonArray();
        }
    }
    return new JsonArray();
}

/** Builds the INSERT that copies the report header from {@code tmpjson} into {@code datacitereports}. */
private static String insertReportSql() {
    String schema = ConnectDB.getDataSetUsageStatsDBSchema();
    return "INSERT INTO " + schema
        + " .datacitereports "
        + "SELECT\n"
        + " get_json_object(json, '$.report.id') AS reportid,\n"
        + " get_json_object(json, '$.report.report-header.report-name') AS name,\n"
        + " get_json_object(json, '$.report.report-header.report-id') AS source,\n"
        + " get_json_object(json, '$.report.report-header.release') AS release,\n"
        + " get_json_object(json, '$.report.report-header.created-by') AS createdby,\n"
        + " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n"
        + " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n"
        + "FROM " + schema + ".tmpjson";
}

/** Builds the INSERT that copies dataset number {@code i} from {@code tmpjson} into {@code datasets}. */
private static String insertDatasetSql(int i) {
    String schema = ConnectDB.getDataSetUsageStatsDBSchema();
    String dataset = " get_json_object(json, '$.report.report-datasets[" + i + "]";
    return "INSERT INTO " + schema
        + " .datasets "
        + "SELECT\n"
        + dataset + ".dataset-id[0].value') AS ds_type,\n"
        + dataset + ".dataset-title') AS ds_title,\n"
        + dataset + ".yop') AS yop,\n"
        + dataset + ".uri') AS uri,\n"
        + dataset + ".platform') AS platform,\n"
        + dataset + ".data-type') AS data_type,\n"
        + dataset + ".publisher') AS publisher,\n"
        + dataset + ".publisher-id.type[0]') AS publisher_id_type,\n"
        + dataset + ".publisher-id.value[0]') AS publisher_id_value,\n"
        + dataset + ".dataset-dates.type[0]') AS ds_dates_type,\n"
        + dataset + ".dataset-dates.value[0]') AS ds_dates_value,\n"
        + dataset + ".dataset-contributors') AS ds_contributors,\n"
        + " get_json_object(json, '$.report.id') AS reportid \n"
        + "FROM " + schema + ".tmpjson";
}

/**
 * Builds the INSERT that copies instance {@code k} of performance period {@code j} of dataset
 * {@code i} from {@code tmpjson} into {@code datasetsperformance}.
 */
private static String insertPerformanceSql(int i, int j, int k) {
    String schema = ConnectDB.getDataSetUsageStatsDBSchema();
    String dataset = " get_json_object(json, '$.report.report-datasets[" + i + "]";
    String period = dataset + ".performance[" + j + "]";
    String instance = period + ".instance[" + k + "]";
    return "INSERT INTO "
        + schema + " .datasetsperformance "
        + "SELECT\n"
        + dataset + ".dataset-id[0].value') AS ds_type,\n"
        + period + ".period.end-date') AS period_end,\n"
        + period + ".period.begin-date') AS period_from,\n"
        + instance + ".access-method') AS access_method,\n"
        + instance + ".metric-type') AS metric_type,\n"
        + instance + ".count') AS count,\n"
        + instance + ".country-counts') AS country_counts,\n"
        + " get_json_object(json, '$.report.id') AS reportid \n"
        + "FROM " + schema + ".tmpjson";
}
/**
 * Stores the datasets of an uncompressed report subset in the {@code datasets} table.
 * <p>
 * The dataset list is read from {@code report_datasets} or, if that member is absent, from
 * {@code report-datasets}. Missing scalar members are stored as empty strings. For the
 * {@code dataset-id}, {@code publisher-id} and {@code dataset-dates} arrays the values of the
 * last entry are used.
 *
 * @param report   JSON text of the uncompressed report subset
 * @param reportId id of the report the subset belongs to; stored in the {@code reportid} column
 * @throws Exception if the JSON cannot be parsed or an insert fails
 */
public void readCompressedReport(String report, String reportId) throws Exception {
    JsonObject root = new Gson().fromJson(report, JsonObject.class);
    if (root == null) {
        logger.warn("Compressed report is empty for report " + reportId);
        return;
    }

    JsonArray datasets = arrayOf(root, "report_datasets");
    if (datasets == null) {
        datasets = arrayOf(root, "report-datasets");
    }
    if (datasets == null) {
        logger.warn("No datasets found in compressed report " + reportId);
        return;
    }

    // Column list matches the datasets table, whose date column is named ds_pub_date.
    String sqlInsertDataset = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
        + " .datasets(ds_type,"
        + "ds_title,yop,uri,platform,data_type,publisher,publisher_id_type,publisher_id_value,"
        + "ds_dates_type, ds_pub_date, ds_contributors,reportid) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?) ";

    for (JsonElement datasetElement : datasets) {
        JsonObject dataset = datasetElement.getAsJsonObject();

        JsonArray contributors = arrayOf(dataset, "dataset-contributors");
        if (contributors == null) {
            // NOTE(review): datasets without contributors have never been inserted by this
            // method; that behaviour is kept here - confirm it is intended.
            continue;
        }

        String publisherIdType = "";
        String publisherIdValue = "";
        JsonArray publisherIds = arrayOf(dataset, "publisher-id");
        if (publisherIds != null) {
            for (JsonElement publisherId : publisherIds) {
                publisherIdType = textOf(publisherId.getAsJsonObject(), "type");
                publisherIdValue = textOf(publisherId.getAsJsonObject(), "value");
            }
        }

        String datesType = "";
        String datesValue = "";
        JsonArray datasetDates = arrayOf(dataset, "dataset-dates");
        if (datasetDates != null) {
            for (JsonElement datasetDate : datasetDates) {
                datesType = textOf(datasetDate.getAsJsonObject(), "type");
                datesValue = textOf(datasetDate.getAsJsonObject(), "value");
            }
        }

        String doi = "";
        JsonArray datasetIds = arrayOf(dataset, "dataset-id");
        if (datasetIds != null) {
            for (JsonElement datasetId : datasetIds) {
                doi = textOf(datasetId.getAsJsonObject(), "value");
            }
        }

        try (PreparedStatement pstmtDataset = ConnectDB.getHiveConnection().prepareStatement(sqlInsertDataset)) {
            pstmtDataset.setString(1, doi);
            pstmtDataset.setString(2, textOf(dataset, "dataset-title"));
            pstmtDataset.setString(3, textOf(dataset, "yop"));
            pstmtDataset.setString(4, textOf(dataset, "uri"));
            pstmtDataset.setString(5, textOf(dataset, "platform"));
            pstmtDataset.setString(6, textOf(dataset, "data-type"));
            pstmtDataset.setString(7, textOf(dataset, "publisher"));
            pstmtDataset.setString(8, publisherIdType);
            pstmtDataset.setString(9, publisherIdValue);
            pstmtDataset.setString(10, datesType);
            pstmtDataset.setString(11, datesValue);
            // The contributors array is stored as its JSON text; getAsString() only works for
            // single-element arrays.
            pstmtDataset.setString(12, contributors.toString());
            pstmtDataset.setString(13, reportId);
            pstmtDataset.execute();
        }
        logger.info("Dataset from compressed report addded " + doi);

        // TODO: the "performance" entries of compressed reports are not stored yet.
    }
}

/** Returns the array stored under {@code name}, or null when the member is absent or not an array. */
private static JsonArray arrayOf(JsonObject parent, String name) {
    JsonElement member = parent.get(name);
    return member != null && member.isJsonArray() ? member.getAsJsonArray() : null;
}

/**
 * Returns the member {@code name} as text: "" when absent or JSON null, the plain value for
 * primitives and the JSON text for objects and arrays.
 */
private static String textOf(JsonObject parent, String name) {
    JsonElement member = parent.get(name);
    if (member == null || member.isJsonNull()) {
        return "";
    }
    return member.isJsonPrimitive() ? member.getAsString() : member.toString();
}
/**
 * Lists the files directly inside {@code dir} (no recursion) and returns their full paths.
 *
 * @param dir directory path, appended to the URI of the configured file system
 * @return the paths of the files found, as strings
 * @throws Exception wrapping the original failure when the directory cannot be listed
 */
private ArrayList<String> listHdfsDir(String dir) throws Exception {
    // FileSystem.get returns a shared, cached instance; closing it here would break every
    // later user of the same file system, so it is deliberately left open.
    FileSystem hdfs = FileSystem.get(new Configuration());
    ArrayList<String> fileNames = new ArrayList<>();

    try {
        Path exportPath = new Path(hdfs.getUri() + dir);
        RemoteIterator<LocatedFileStatus> files = hdfs.listFiles(exportPath, false);
        while (files.hasNext()) {
            fileNames.add(files.next().getPath().toString());
        }
    } catch (Exception e) {
        logger.error("HDFS file path with exported data does not exist : " + hdfs.getUri() + dir);
        throw new Exception("HDFS file path with exported data does not exist : " + dir, e);
    }

    return fileNames;
}
/**
 * Reads a file through the configured Hadoop file system and returns its content with all
 * line breaks removed and surrounding whitespace trimmed.
 *
 * @param filename path of the file to read
 * @return the concatenated lines of the file
 * @throws Exception wrapping any failure while opening or reading the file
 */
private String readHDFSFile(String filename) throws Exception {
    try {
        FileSystem fs = FileSystem.get(new Configuration());
        StringBuilder content = new StringBuilder();
        // The reader (and the underlying stream) is closed even when reading fails; the file
        // system itself stays open.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        }
        return content.toString().trim();
    } catch (Exception e) {
        throw new Exception(e);
    }
}
/**
 * Decodes a Base64 string, gunzips the resulting bytes and returns them as UTF-8 text.
 *
 * @param zippedBase64Str Base64 encoding of gzip-compressed data; must not be null
 * @return the uncompressed content decoded as UTF-8
 * @throws IOException              if the decoded bytes are not valid gzip data
 * @throws IllegalArgumentException if the input is not valid Base64
 */
public static String uncompressString(String zippedBase64Str)
    throws IOException {
    byte[] compressed = Base64.getDecoder().decode(zippedBase64Str);

    try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressed));
        ByteArrayOutputStream plain = new ByteArrayOutputStream()) {
        byte[] buffer = new byte[8192];
        int read;
        while ((read = gzip.read(buffer)) != -1) {
            plain.write(buffer, 0, read);
        }
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        return plain.toString("UTF-8");
    }
}
/**
 * Recreates the temporary {@code tmpjson} table and loads the given report file into it.
 *
 * @param jsonFile path of the report file passed to {@code LOAD DATA INPATH}
 * @throws SQLException if dropping, creating or loading the table fails
 */
private void createTmpReportsTable(String jsonFile) throws SQLException {
    // The statement is closed when the block ends, also when one of the commands fails.
    try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
        dropTmpReportsTable();

        String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema()
            + ".tmpjson (json STRING)";
        stmt.executeUpdate(createTmpTable);
        logger.info("Tmp Table Created");

        String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE "
            + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
        stmt.execute(insertJsonReport);
        logger.info("JSON Report File inserted to tmpjson Table");
    }
}
/**
 * Drops the {@code tmpjson} table if it exists.
 *
 * @throws SQLException if the DROP statement fails
 */
private void dropTmpReportsTable() throws SQLException {
    logger.info("Dropping tmpjson Table");
    String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
    // Close the statement right after use instead of leaving it open on the connection.
    try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
        stmt.executeUpdate(dropTmpTable);
    }
    logger.info("Dropped tmpjson Table");
}
}
/*
* PreparedStatement prepStatem = conn.
* prepareStatement("insert into usageStats (source, entityID,sourceItemType,entityType, counter,action,timestamp_month,referrer) values (?,?,?,?,?,?,?,?)"
* );
*/

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import java.io.IOException;
@ -43,194 +44,68 @@ public class UsageStatsExporter {
ConnectDB.getHiveConnection();
if (ExecuteWorkflow.recreateDbAndTables) {
createDatabase();
createTables();
reCreateLogDirs();
DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", "");
datasetsDB.recreateDBAndTables();
}
logger.info("Initializing the download logs module");
DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, ExecuteWorkflow.dataciteReportPath);
DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL,
ExecuteWorkflow.dataciteReportPath);
if (ExecuteWorkflow.datasetsEmptyDirs) {
logger.info("Downloading Reports List From Datacite");
drfd.downloadReportsList();
logger.info("Reports List has been downloaded");
}
}
/**
 * Drops the dataset usage-stats database (with {@code CASCADE}) if it exists and then creates it
 * again.
 *
 * @throws Exception wrapping the underlying cause if either the drop or the create step fails
 */
private void createDatabase() throws Exception {
	// Each step uses its own statement, closed automatically so that no statement is leaked.
	try (Statement dropStmt = ConnectDB.getHiveConnection().createStatement()) {
		logger.info("Dropping datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
		String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
		dropStmt.executeUpdate(dropDatabase);
	} catch (Exception e) {
		logger.error("Failed to drop database: " + e);
		throw new Exception("Failed to drop database: " + e.toString(), e);
	}

	try (Statement createStmt = ConnectDB.getHiveConnection().createStatement()) {
		logger.info("Creating datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
		String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
		createStmt.executeUpdate(createDatabase);
	} catch (Exception e) {
		logger.error("Failed to create database: " + e);
		throw new Exception("Failed to create database: " + e.toString(), e);
	}
}
/**
 * Creates the {@code datacitereports} table in the dataset usage-stats schema if it does not exist
 * yet, then runs {@code ReadReportsListFromDatacite.readReports()} for
 * {@code ExecuteWorkflow.dataciteReportPath}.
 *
 * @throws Exception wrapping the underlying cause if the table cannot be created; anything thrown
 *                   while reading the reports is propagated as-is
 */
private void createTables() throws Exception {
try {
stmt = ConnectDB.getHiveConnection().createStatement();
// Create Reports table - This table should exist
// Bucketed, ORC-backed, transactional table holding one row of metadata per Datacite report.
String sqlCreateTableDataciteeReports = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getDataSetUsageStatsDBSchema()
+ ".datacitereports(reportid STRING, \n"
+ " name STRING, \n"
+ " source STRING,\n"
+ " release STRING,\n"
+ " createdby STRING,\n"
+ " report_end_date STRING,\n"
+ " report_start_date STRING)\n"
+ " CLUSTERED BY (reportid)\n"
+ " into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableDataciteeReports);
// Release the statement and the Hive connection (only reached when the create succeeded).
stmt.close();
ConnectDB.getHiveConnection().close();
} catch (Exception e) {
logger.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
// NOTE(review): the steps below run after the Hive connection above was closed - presumably
// readReports() obtains its own connection; confirm.
ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite(
ExecuteWorkflow.dataciteReportPath);
logger.info("Store Reports To DB");
readReportsListFromDatacite.readReports();
logger.info("Reports Stored To DB");
}
// runImpalaQuery();
/*
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
logger.info("Re-creating database and tables");
logger.info("Initializing the download logs module");
PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
if (ExecuteWorkflow.piwikEmptyDirs) {
logger.info("Recreating Piwik log directories");
piwikstatsdb.reCreateLogDirs();
}
// Downloading piwik logs (also managing directory creation)
if (ExecuteWorkflow.downloadPiwikLogs) {
logger.info("Downloading piwik logs");
piwd
.GetOpenAIRELogs(
ExecuteWorkflow.repoLogPath,
ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
}
logger.info("Downloaded piwik logs");
// Create DB tables, insert/update statistics
String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
if (ExecuteWorkflow.processPiwikLogs) {
logger.info("Processing logs");
piwikstatsdb.processLogs();
}
logger.info("Creating LaReferencia tables");
LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
ExecuteWorkflow.lareferenciaAuthToken);
if (ExecuteWorkflow.laReferenciaEmptyDirs) {
logger.info("Recreating LaReferencia log directories");
lrf.reCreateLogDirs();
}
if (ExecuteWorkflow.downloadLaReferenciaLogs) {
logger.info("Downloading LaReferencia logs");
lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
logger.info("Downloaded LaReferencia logs");
}
LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
if (ExecuteWorkflow.processLaReferenciaLogs) {
logger.info("Processing LaReferencia logs");
lastats.processLogs();
logger.info("LaReferencia logs done");
}
IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
logger.info("Creating Irus Stats tables");
irusstats.createTables();
logger.info("Created Irus Stats tables");
logger.info("Re-create log dirs");
irusstats.reCreateLogDirs();
logger.info("Re-created log dirs");
}
if (ExecuteWorkflow.irusDownloadReports) {
irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
}
if (ExecuteWorkflow.irusProcessStats) {
irusstats.processIrusStats();
logger.info("Irus done");
}
SarcStats sarcStats = new SarcStats();
if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
sarcStats.reCreateLogDirs();
}
if (ExecuteWorkflow.sarcDownloadReports) {
sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
}
if (ExecuteWorkflow.sarcProcessStats) {
sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
sarcStats.finalizeSarcStats();
}
logger.info("Sarc done");
// finalize usagestats
if (ExecuteWorkflow.finalizeStats) {
piwikstatsdb.finalizeStats();
logger.info("Finalized stats");
}
// Make the tables available to Impala
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
logger.info("Making tables visible to Impala");
invalidateMetadata();
}
logger.info("End");
* PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
* logger.info("Re-creating database and tables"); logger.info("Initializing the download logs module");
* PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
* if (ExecuteWorkflow.piwikEmptyDirs) { logger.info("Recreating Piwik log directories");
* piwikstatsdb.reCreateLogDirs(); } // Downloading piwik logs (also managing directory creation) if
* (ExecuteWorkflow.downloadPiwikLogs) { logger.info("Downloading piwik logs"); piwd .GetOpenAIRELogs(
* ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); }
* logger.info("Downloaded piwik logs"); // Create DB tables, insert/update statistics String cRobotsUrl =
* "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
* piwikstatsdb.setCounterRobotsURL(cRobotsUrl); if (ExecuteWorkflow.processPiwikLogs) {
* logger.info("Processing logs"); piwikstatsdb.processLogs(); } logger.info("Creating LaReferencia tables");
* LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
* ExecuteWorkflow.lareferenciaAuthToken); if (ExecuteWorkflow.laReferenciaEmptyDirs) {
* logger.info("Recreating LaReferencia log directories"); lrf.reCreateLogDirs(); } if
* (ExecuteWorkflow.downloadLaReferenciaLogs) { logger.info("Downloading LaReferencia logs");
* lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); }
* LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if
* (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); lastats.processLogs();
* logger.info("LaReferencia logs done"); } IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if
* (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables");
* irusstats.createTables(); logger.info("Created Irus Stats tables"); logger.info("Re-create log dirs");
* irusstats.reCreateLogDirs(); logger.info("Re-created log dirs"); } if (ExecuteWorkflow.irusDownloadReports) {
* irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } if (ExecuteWorkflow.irusProcessStats) {
* irusstats.processIrusStats(); logger.info("Irus done"); } SarcStats sarcStats = new SarcStats(); if
* (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); } if
* (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray,
* ExecuteWorkflow.sarcsReportPathNonArray); } if (ExecuteWorkflow.sarcProcessStats) {
* sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
* sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); // finalize usagestats if
* (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); logger.info("Finalized stats"); } // Make the
* tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) {
* logger.info("Making tables visible to Impala"); invalidateMetadata(); } logger.info("End");
*/
}
/*
private void invalidateMetadata() throws SQLException {
Statement stmt = null;
stmt = ConnectDB.getImpalaConnection().createStatement();
String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
stmt.close();
ConnectDB.getHiveConnection().close();
}
* private void invalidateMetadata() throws SQLException { Statement stmt = null; stmt =
* ConnectDB.getImpalaConnection().createStatement(); String sql = "INVALIDATE METADATA " +
* ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA "
* + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " +
* ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " +
* ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); stmt.close();
* ConnectDB.getHiveConnection().close(); }
*/

View File

@ -23,7 +23,35 @@
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-usage-raw-data-update</artifactId>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.15</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<!-- more config here as you see fit -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>

View File

@ -0,0 +1 @@
mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagerawdata

View File

@ -62,7 +62,6 @@ public class ExecuteWorkflow {
static int sarcNumberOfIssnToDownload;
static boolean finalizeStats;
static boolean finalTablesVisibleToImpala;
static int numberOfDownloadThreads;
@ -98,98 +97,108 @@ public class ExecuteWorkflow {
usageStatsDBSchema = parser.get("usageStatsDBSchema");
statsDBSchema = parser.get("statsDBSchema");
if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
recreateDbAndTables = true;
else
} else {
recreateDbAndTables = false;
}
if (parser.get("piwikEmptyDirs").toLowerCase().equals("true"))
if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) {
piwikEmptyDirs = true;
else
} else {
piwikEmptyDirs = false;
}
if (parser.get("downloadPiwikLogs").toLowerCase().equals("true"))
if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) {
downloadPiwikLogs = true;
else
} else {
downloadPiwikLogs = false;
}
if (parser.get("processPiwikLogs").toLowerCase().equals("true"))
if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
processPiwikLogs = true;
else
} else {
processPiwikLogs = false;
}
String startingLogPeriodStr = parser.get("startingLogPeriod");
Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
String endingLogPeriodStr = parser.get("endingLogPeriod");
Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
// String endingLogPeriodStr = parser.get("endingLogPeriod");
// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true"))
if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) {
laReferenciaEmptyDirs = true;
else
} else {
laReferenciaEmptyDirs = false;
}
if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true"))
if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) {
downloadLaReferenciaLogs = true;
else
} else {
downloadLaReferenciaLogs = false;
}
if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true"))
if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) {
processLaReferenciaLogs = true;
else
} else {
processLaReferenciaLogs = false;
}
if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true"))
if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) {
irusCreateTablesEmptyDirs = true;
else
} else {
irusCreateTablesEmptyDirs = false;
}
if (parser.get("irusDownloadReports").toLowerCase().equals("true"))
if (parser.get("irusDownloadReports").toLowerCase().equals("true")) {
irusDownloadReports = true;
else
} else {
irusDownloadReports = false;
}
if (parser.get("irusProcessStats").toLowerCase().equals("true"))
if (parser.get("irusProcessStats").toLowerCase().equals("true")) {
irusProcessStats = true;
else
} else {
irusProcessStats = false;
}
irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload"));
if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true"))
if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) {
sarcCreateTablesEmptyDirs = true;
else
} else {
sarcCreateTablesEmptyDirs = false;
}
if (parser.get("sarcDownloadReports").toLowerCase().equals("true"))
if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) {
sarcDownloadReports = true;
else
} else {
sarcDownloadReports = false;
}
if (parser.get("sarcProcessStats").toLowerCase().equals("true"))
if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
sarcProcessStats = true;
else
} else {
sarcProcessStats = false;
}
sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload"));
/*
if (parser.get("finalizeStats").toLowerCase().equals("true"))
if (parser.get("finalizeStats").toLowerCase().equals("true")) {
finalizeStats = true;
else
} else {
finalizeStats = false;
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
finalTablesVisibleToImpala = true;
else
finalTablesVisibleToImpala = false;
*/
}
numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
usagestatsExport.export();
// usagestatsExport.createdDBWithTablesOnly();
}
private static Calendar startingLogPeriodStr(Date date) {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*;
@ -58,17 +59,6 @@ public class IrusStats {
stmt.executeUpdate(sqlCreateTableSushiLog);
logger.info("Created sushilog");
// To see how to apply to the ignore duplicate rules and indexes
// stmt.executeUpdate(sqlCreateTableSushiLog);
// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO sushilog "
// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
// + "sushilog.rid, sushilog.date "
// + "FROM sushilog "
// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
// stmt.executeUpdate(sqlcreateRuleSushiLog);
// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
// stmt.executeUpdate(createSushiIndex);
stmt.close();
ConnectDB.getHiveConnection().close();
logger.info("Sushi Tables Created");
@ -78,33 +68,6 @@ public class IrusStats {
}
}
// // The following may not be needed - It will be created when JSON tables are created
// private void createTmpTables() throws Exception {
// try {
//
// Statement stmt = ConnectDB.getConnection().createStatement();
// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
// stmt.executeUpdate(sqlCreateTableSushiLog);
//
// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;");
// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;";
// // stmt.executeUpdate(sqlCopyPublicSushiLog);
// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO sushilogtmp "
// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository,"
// + "sushilogtmp.rid, sushilogtmp.date "
// + "FROM sushilogtmp "
// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
// stmt.executeUpdate(sqlcreateRuleSushiLog);
//
// stmt.close();
// ConnectDB.getConnection().close();
// log.info("Sushi Tmp Tables Created");
// } catch (Exception e) {
// log.error("Failed to create tables: " + e);
// throw new Exception("Failed to create tables: " + e.toString(), e);
// }
// }
public void processIrusStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
@ -174,42 +137,7 @@ public class IrusStats {
+ "WHERE `ItemIdent`.`Type`= 'OAI'";
stmt.executeUpdate(insertSushilogtmp);
logger.info("Inserted to irus_sushilogtmp table");
/*
logger.info("Creating downloads_stats table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`result_id` string, "
+ "`date` string, "
+ "`count` bigint, "
+ "`openaire` bigint)";
stmt.executeUpdate(createDownloadsStats);
logger.info("Created downloads_stats table");
logger.info("Inserting into downloads_stats");
String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT s.source, d.id AS repository_id, "
+ "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, "
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
stmt.executeUpdate(insertDStats);
logger.info("Inserted into downloads_stats");
logger.info("Creating sushilog table");
String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`rid` string, "
+ "`date` string, "
+ "`metric_type` string, "
+ "`count` int)";
stmt.executeUpdate(createSushilog);
logger.info("Created sushilog table");
*/
logger.info("Inserting to sushilog table");
String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM "
+ ConnectDB.getUsageStatsDBSchema()
@ -227,9 +155,12 @@ public class IrusStats {
logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime()));
// Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1);
// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
// end.add(Calendar.MONTH, +1);
// end.add(Calendar.DAY_OF_MONTH, -1);
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime()));
String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate="
@ -248,6 +179,7 @@ public class IrusStats {
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Customer");
JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
if (jsonArray != null) {
int i = 0;
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
@ -277,8 +209,11 @@ public class IrusStats {
logger.info("Now working on openDoar: " + opendoar);
this.getIrusIRReport(opendoar, irusUKReportPath);
}
logger.info("(getIrusRRReport) Finished with report: " + reportUrl);
} else {
logger.info("IRUS Reports not found for day");
}
}
private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception {
@ -294,9 +229,12 @@ public class IrusStats {
logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
// Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1);
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
// end.add(Calendar.MONTH, +1);
// end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
@ -317,11 +255,12 @@ public class IrusStats {
rs_date.close();
int batch_size = 0;
if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) {
if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) {
logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar);
} else {
start.add(Calendar.MONTH, 1);
while (start.before(end)) {
logger.info("date: " + simpleDateFormat.format(start.getTime()));
logger.info("Downloading for date: " + simpleDateFormat.format(start.getTime()));
String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime())
+ "&RepositoryIdentifier=opendoar%3A" + opendoar

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*;
@ -183,9 +184,12 @@ public class LaReferenciaDownloadLogs {
logger.info("Starting period for log download: " + sdf.format(start.getTime()));
// Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1);
// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
// end.add(Calendar.MONTH, +1);
// end.add(Calendar.DAY_OF_MONTH, -1);
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("Ending period for log download: " + sdf.format(end.getTime()));
PreparedStatement st = ConnectDB
@ -209,7 +213,10 @@ public class LaReferenciaDownloadLogs {
for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
Date date = currDay.getTime();
if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID);
logger
.info(
"Date found in logs " + dateMax + " and not downloanding Matomo logs for "
+ laReferencialMatomoID);
} else {
logger
.info(
@ -223,7 +230,8 @@ public class LaReferenciaDownloadLogs {
FileSystem fs = FileSystem.get(new Configuration());
FSDataOutputStream fin = fs
.create(
new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"),
new Path(
outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"),
true);
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format

View File

@ -61,15 +61,6 @@ public class LaReferenciaStats {
"stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableLareferenciaLog);
logger.info("Created LaReferencia tables");
// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO lareferencialog "
// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id "
// + "FROM lareferencialog "
// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");";
// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
stmt.close();
ConnectDB.getHiveConnection().close();
@ -82,30 +73,6 @@ public class LaReferenciaStats {
}
}
// private void createTmpTables() throws Exception {
//
// try {
// Statement stmt = ConnectDB.getConnection().createStatement();
// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO lareferencialogtmp "
// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit,"
// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id "
// + "FROM lareferencialogtmp "
// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog);
// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog);
//
// stmt.close();
// log.info("Lareferencia Tmp Tables Created");
//
// } catch (Exception e) {
// log.error("Failed to create tmptables: " + e);
// throw new Exception("Failed to create tmp tables: " + e.toString(), e);
// // System.exit(0);
// }
// }
public void processLogs() throws Exception {
try {
logger.info("Processing LaReferencia repository logs");
@ -116,15 +83,6 @@ public class LaReferenciaStats {
removeDoubleClicks();
logger.info("LaReferencia removed double clicks");
/********
logger.info("LaReferencia creating viewsStats");
viewsStats();
logger.info("LaReferencia created viewsStats");
logger.info("LaReferencia creating downloadsStats");
downloadsStats();
logger.info("LaReferencia created downloadsStats");
************/
logger.info("LaReferencia updating Production Tables");
updateProdTables();
logger.info("LaReferencia updated Production Tables");
@ -255,88 +213,6 @@ public class LaReferenciaStats {
// conn.close();
}
/**
 * Builds the monthly LaReferencia view statistics from {@code lareferencialogtmp}.
 * <p>
 * Steps, all in the usage-stats schema:
 * <ol>
 * <li>(re)creates the view {@code la_result_views_monthly_tmp}, counting {@code action='action'}
 * rows per entity, month and source;</li>
 * <li>drops {@code la_views_stats_tmp} if present;</li>
 * <li>creates {@code la_views_stats_tmp} by joining that view with {@code datasource_oids} and
 * {@code result_oids} of the stats schema.</li>
 * </ol>
 * On success the Hive connection is closed at the end.
 *
 * @throws Exception if obtaining the connection/statement or executing any query fails
 */
public void viewsStats() throws Exception {
	// try-with-resources closes the statement on the failure path as well.
	try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
		ConnectDB.getHiveConnection().setAutoCommit(false);

		String usageSchema = ConnectDB.getUsageStatsDBSchema();
		String statsSchema = ConnectDB.getStatsDBSchema();

		logger.info("Creating la_result_views_monthly_tmp view");
		String sql = "CREATE OR REPLACE VIEW " + usageSchema + ".la_result_views_monthly_tmp AS "
			+ "SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' "
			+ "THEN 1 ELSE 0 END) AS openaire_referrer, "
			+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
			+ "FROM " + usageSchema + ".lareferencialogtmp where action='action' and "
			+ "(source_item_type='oaItem' or source_item_type='repItem') "
			+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
			+ "source ORDER BY source, entity_id";
		stmt.executeUpdate(sql);
		logger.info("Created la_result_views_monthly_tmp view");

		logger.info("Dropping la_views_stats_tmp table");
		sql = "DROP TABLE IF EXISTS " + usageSchema + ".la_views_stats_tmp";
		stmt.executeUpdate(sql);
		logger.info("Dropped la_views_stats_tmp table");

		logger.info("Creating la_views_stats_tmp table");
		sql = "CREATE TABLE IF NOT EXISTS " + usageSchema + ".la_views_stats_tmp "
			+ "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, "
			+ "max(views) AS count, max(openaire_referrer) AS openaire "
			+ "FROM " + usageSchema + ".la_result_views_monthly_tmp p, "
			+ statsSchema + ".datasource_oids d, " + statsSchema + ".result_oids ro "
			+ "WHERE p.source=d.oid AND p.id=ro.oid "
			+ "GROUP BY d.id, ro.id, month "
			+ "ORDER BY d.id, ro.id, month";
		stmt.executeUpdate(sql);
		logger.info("Created la_views_stats_tmp table");
	}
	// As before, the connection is only closed after every step succeeded.
	ConnectDB.getHiveConnection().close();
}
/**
 * Builds the monthly LaReferencia download statistics from {@code lareferencialogtmp}.
 * <p>
 * Steps, all in the usage-stats schema:
 * <ol>
 * <li>(re)creates the view {@code la_result_downloads_monthly_tmp}, counting
 * {@code action='download'} rows per entity, month and source;</li>
 * <li>drops {@code la_downloads_stats_tmp} if present;</li>
 * <li>creates {@code la_downloads_stats_tmp} by joining that view with {@code datasource_oids}
 * and {@code result_oids} of the stats schema.</li>
 * </ol>
 * On success the Hive connection is closed at the end.
 *
 * @throws Exception if obtaining the connection/statement or executing any query fails
 */
private void downloadsStats() throws Exception {
	// try-with-resources closes the statement on the failure path as well.
	try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
		ConnectDB.getHiveConnection().setAutoCommit(false);

		String usageSchema = ConnectDB.getUsageStatsDBSchema();
		String statsSchema = ConnectDB.getStatsDBSchema();

		logger.info("Creating la_result_downloads_monthly_tmp view");
		String sql = "CREATE OR REPLACE VIEW " + usageSchema
			+ ".la_result_downloads_monthly_tmp AS "
			+ "SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' "
			+ "THEN 1 ELSE 0 END) AS openaire_referrer, "
			+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
			+ "FROM " + usageSchema + ".lareferencialogtmp where action='download' and "
			+ "(source_item_type='oaItem' or source_item_type='repItem') "
			+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
			+ "source ORDER BY source, entity_id";
		stmt.executeUpdate(sql);
		logger.info("Created la_result_downloads_monthly_tmp view");

		logger.info("Dropping la_downloads_stats_tmp table");
		sql = "DROP TABLE IF EXISTS " + usageSchema + ".la_downloads_stats_tmp";
		stmt.executeUpdate(sql);
		logger.info("Dropped la_downloads_stats_tmp table");

		logger.info("Creating la_downloads_stats_tmp table");
		sql = "CREATE TABLE IF NOT EXISTS " + usageSchema + ".la_downloads_stats_tmp "
			+ "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, "
			+ "max(downloads) AS count, max(openaire_referrer) AS openaire "
			+ "FROM " + usageSchema + ".la_result_downloads_monthly_tmp p, "
			+ statsSchema + ".datasource_oids d, " + statsSchema + ".result_oids ro "
			+ "WHERE p.source=d.oid AND p.id=ro.oid "
			+ "GROUP BY d.id, ro.id, month "
			+ "ORDER BY d.id, ro.id, month";
		stmt.executeUpdate(sql);
		logger.info("Created la_downloads_stats_tmp table");
	}
	// As before, the connection is only closed after every step succeeded.
	ConnectDB.getHiveConnection().close();
}
private void updateProdTables() throws SQLException, Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
@ -346,36 +222,7 @@ public class LaReferenciaStats {
String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog " +
"select * from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp";
stmt.executeUpdate(sql);
/*****
logger.info("Updating views_stats");
sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
"select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp";
stmt.executeUpdate(sql);
// sql = "insert into public.views_stats select * from la_views_stats_tmp;";
// stmt.executeUpdate(sql);
logger.info("Updating downloads_stats");
sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
"select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserting data to usage_stats from lareferencia");
sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
"SELECT coalesce(ds.source, vs.source) as source, " +
"coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
"coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
"coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
"coalesce(ds.openaire, 0) as openaire_downloads, " +
"coalesce(vs.openaire, 0) as openaire_views " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp AS ds FULL OUTER JOIN " +
ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp AS vs ON ds.source=vs.source " +
"AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
stmt.executeUpdate(sql);
logger.info("Inserted data to usage_stats from lareferencia");
// sql = "insert into public.downloads_stats select * from la_downloads_stats_tmp;";
// stmt.executeUpdate(sql);
****/
logger.info("Dropping lareferencialogtmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp";
logger.info("Dropped lareferencialogtmp");

View File

@ -1,9 +1,12 @@
package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*;
import java.net.Authenticator;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
@ -193,11 +196,9 @@ public class PiwikDownloadLogs {
// Getting all the piwikids in a list for logging reasons & limiting the list
// to the max number of piwikids
List<Integer> piwikIdToVisit = new ArrayList<Integer>();
//while (rs.next())
//piwikIdToVisit.add(rs.getInt(1));
piwikIdToVisit.add(13);
piwikIdToVisit.add(109);
while (rs.next()) {
piwikIdToVisit.add(rs.getInt(1));
}
logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
@ -215,9 +216,11 @@ public class PiwikDownloadLogs {
logger.info("Starting period for log download: " + sdf.format(start.getTime()));
// Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1);
// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
// end.add(Calendar.MONTH, +1);
// end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("Ending period for log download: " + sdf.format(end.getTime()));
logger.info("Now working on piwikId: " + siteId);

View File

@ -86,6 +86,7 @@ public class PiwikStatsDB {
logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
stmt.executeUpdate(dropDatabase);
} catch (Exception e) {
logger.error("Failed to drop database: " + e);
throw new Exception("Failed to drop database: " + e.toString(), e);
@ -117,10 +118,15 @@ public class PiwikStatsDB {
+ "into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTablePiwikLog);
// String dropT = "TRUNCATE TABLE "
// + ConnectDB.getUsageStatsDBSchema()
// + ".piwiklog ";
// stmt.executeUpdate(dropT);
// logger.info("truncated piwiklog");
/////////////////////////////////////////
// Rule for duplicate inserts @ piwiklog
/////////////////////////////////////////
String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
@ -131,7 +137,6 @@ public class PiwikStatsDB {
//////////////////////////////////////////////////
// Rule for duplicate inserts @ process_portal_log
//////////////////////////////////////////////////
stmt.close();
ConnectDB.getHiveConnection().close();
@ -141,47 +146,6 @@ public class PiwikStatsDB {
}
}
/***** public void createTmpTables() throws Exception {
try {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ "stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTmpTablePiwikLog);
//////////////////////////////////////////////////
// Rule for duplicate inserts @ piwiklogtmp
//////////////////////////////////////////////////
//////////////////////////////////////////////////
// Copy from public.piwiklog to piwiklog
//////////////////////////////////////////////////
// String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;";
// stmt.executeUpdate(sqlCopyPublicPiwiklog);
String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTmpTablePortalLog);
//////////////////////////////////////////////////
// Rule for duplicate inserts @ process_portal_log_tmp
//////////////////////////////////////////////////
stmt.close();
} catch (Exception e) {
logger.error("Failed to create tmptables: " + e);
throw new Exception("Failed to create tmp tables: " + e.toString(), e);
// System.exit(0);
}
}
******/
public void processLogs() throws Exception {
try {
ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL());
@ -204,22 +168,16 @@ public class PiwikStatsDB {
logger.info("Portal logs process done");
logger.info("Processing portal usagestats");
portalStats();
portalLogs();
logger.info("Portal usagestats process done");
/*****
logger.info("ViewsStats processing starts");
viewsStats();
logger.info("ViewsStats processing ends");
logger.info("DownloadsStats processing starts");
downloadsStats();
logger.info("DownloadsStats processing ends");
*****/
logger.info("Updating Production Tables");
updateProdTables();
logger.info("Updated Production Tables");
logger.info("Create Pedocs Tables");
createPedocsOldUsageData();
logger.info("Pedocs Tables Created");
} catch (Exception e) {
logger.error("Failed to process logs: " + e);
@ -237,65 +195,65 @@ public class PiwikStatsDB {
logger.info("Added JSON Serde jar");
logger.info("Dropping piwiklogtmp_json table");
String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".piwiklogtmp_json";
String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".piwiklogtmp_json";
stmt.executeUpdate(drop_piwiklogtmp_json);
logger.info("Dropped piwiklogtmp_json table");
logger.info("Creating piwiklogtmp_json");
String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".piwiklogtmp_json(\n" +
" `idSite` STRING,\n" +
" `idVisit` STRING,\n" +
" `country` STRING,\n" +
" `referrerName` STRING,\n" +
" `browser` STRING,\n" +
" `actionDetails` ARRAY<\n" +
" struct<\n" +
" type: STRING,\n" +
" url: STRING,\n" +
" `customVariables`: struct<\n" +
" `1`: struct<\n" +
" `customVariablePageValue1`: STRING\n" +
" >\n" +
" >,\n" +
" timestamp: String\n" +
" >\n" +
" >\n" +
")\n" +
"ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
"LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" +
"TBLPROPERTIES (\"transactional\"=\"false\")";
String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".piwiklogtmp_json(\n"
+ " `idSite` STRING,\n"
+ " `idVisit` STRING,\n"
+ " `country` STRING,\n"
+ " `referrerName` STRING,\n"
+ " `browser` STRING,\n"
+ " `actionDetails` ARRAY<\n"
+ " struct<\n"
+ " type: STRING,\n"
+ " url: STRING,\n"
+ " `customVariables`: struct<\n"
+ " `1`: struct<\n"
+ " `customVariablePageValue1`: STRING\n"
+ " >\n"
+ " >,\n"
+ " timestamp: String\n"
+ " >\n"
+ " >\n"
+ ")\n"
+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n"
+ "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(create_piwiklogtmp_json);
logger.info("Created piwiklogtmp_json");
logger.info("Dropping piwiklogtmp table");
String drop_piwiklogtmp = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".piwiklogtmp";
String drop_piwiklogtmp = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".piwiklogtmp";
stmt.executeUpdate(drop_piwiklogtmp);
logger.info("Dropped piwiklogtmp");
logger.info("Creating piwiklogtmp");
String create_piwiklogtmp = "CREATE TABLE " +
ConnectDB.getUsageStatsDBSchema() +
".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " +
"entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
"clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
String create_piwiklogtmp = "CREATE TABLE "
+ ConnectDB.getUsageStatsDBSchema()
+ ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, "
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(create_piwiklogtmp);
logger.info("Created piwiklogtmp");
logger.info("Inserting into piwiklogtmp");
String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " +
"actiondetail.type as action, actiondetail.url as url, " +
"actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " +
"'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " +
"referrerName as referrer_name, browser as agent\n" +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" +
"LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, "
+ "actiondetail.type as action, actiondetail.url as url, "
+ "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, "
+ "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, "
+ "referrerName as referrer_name, browser as agent\n"
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n"
+ "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
stmt.executeUpdate(insert_piwiklogtmp);
logger.info("Inserted into piwiklogtmp");
@ -308,33 +266,31 @@ public class PiwikStatsDB {
logger.info("Cleaning download double clicks");
// clean download double clicks
String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"WHERE EXISTS (\n" +
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " +
ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" +
"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+
"AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" +
"AND p1.timestamp<p2.timestamp AND ((unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))/60)<30 \n" +
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "WHERE EXISTS (\n"
+ "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n"
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, "
+ ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n"
+ "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+ "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n"
+ "AND p1.timestamp<p2.timestamp AND ((unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))/60)<30 \n"
+ "AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n"
+ "AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
stmt.executeUpdate(sql);
logger.info("Cleaned download double clicks");
// clean view double clicks
logger.info("Cleaning action double clicks");
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"WHERE EXISTS (\n" +
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " +
ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" +
"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+
"AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp \n" +
"AND p1.timestamp<p2.timestamp AND (unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))<10 \n" +
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "WHERE EXISTS (\n"
+ "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n"
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, "
+ ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n"
+ "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+ "AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp \n"
+ "AND p1.timestamp<p2.timestamp AND (unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))<10 \n"
+ "AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n"
+ "AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
stmt.executeUpdate(sql);
logger.info("Cleaned action double clicks");
stmt.close();
@ -349,136 +305,107 @@ public class PiwikStatsDB {
logger.info("Added JSON Serde jar");
logger.info("Dropping process_portal_log_tmp_json table");
String drop_process_portal_log_tmp_json = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".process_portal_log_tmp_json";
String drop_process_portal_log_tmp_json = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log_tmp_json";
stmt.executeUpdate(drop_process_portal_log_tmp_json);
logger.info("Dropped process_portal_log_tmp_json table");
logger.info("Creating process_portal_log_tmp_json");
String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json(" +
" `idSite` STRING,\n" +
" `idVisit` STRING,\n" +
" `country` STRING,\n" +
" `referrerName` STRING,\n" +
" `browser` STRING,\n" +
" `actionDetails` ARRAY<\n" +
" struct<\n" +
" type: STRING,\n" +
" url: STRING,\n" +
" timestamp: String\n" +
" >\n" +
" >\n" +
")\n" +
"ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
"LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" +
"TBLPROPERTIES (\"transactional\"=\"false\")";
String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json("
+ " `idSite` STRING,\n"
+ " `idVisit` STRING,\n"
+ " `country` STRING,\n"
+ " `referrerName` STRING,\n"
+ " `browser` STRING,\n"
+ " `actionDetails` ARRAY<\n"
+ " struct<\n"
+ " type: STRING,\n"
+ " url: STRING,\n"
+ " timestamp: String\n"
+ " >\n"
+ " >\n"
+ ")\n"
+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n"
+ "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(create_process_portal_log_tmp_json);
logger.info("Created process_portal_log_tmp_json");
logger.info("Droping process_portal_log_tmp table");
String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".process_portal_log_tmp";
String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log_tmp";
stmt.executeUpdate(drop_process_portal_log_tmp);
logger.info("Dropped process_portal_log_tmp");
logger.info("Creating process_portal_log_tmp");
String create_process_portal_log_tmp = "CREATE TABLE " +
ConnectDB.getUsageStatsDBSchema() +
".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " +
"entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
"clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
String create_process_portal_log_tmp = "CREATE TABLE "
+ ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, "
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(create_process_portal_log_tmp);
logger.info("Created process_portal_log_tmp");
logger.info("Inserting into process_portal_log_tmp");
String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log_tmp " +
"SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
+
"actiondetail.url as url, " +
"CASE\n" +
" WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " +
" WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " +
" WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
+
" WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " +
" WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " +
" WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " +
" WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " +
" ELSE '' " +
"END AS entity_id, " +
"CASE " +
" WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " +
" WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " +
" WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " +
" WHEN (actiondetail.url like '%articleId=%') THEN 'result' " +
" WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " +
" WHEN (actiondetail.url like '%projectId=%') THEN 'project' " +
" WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " +
" ELSE '' " +
"END AS source_item_type, " +
"from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " +
"browser as agent " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " +
"LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+ ".process_portal_log_tmp "
+ "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
+ "actiondetail.url as url, "
+ "CASE\n"
+ " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] "
+ " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] "
+ " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
+ " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] "
+ " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] "
+ " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] "
+ " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] "
+ " ELSE '' "
+ "END AS entity_id, "
+ "CASE "
+ " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' "
+ " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' "
+ " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' "
+ " WHEN (actiondetail.url like '%articleId=%') THEN 'result' "
+ " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' "
+ " WHEN (actiondetail.url like '%projectId=%') THEN 'project' "
+ " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' "
+ " ELSE '' "
+ "END AS source_item_type, "
+ "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, "
+ "browser as agent "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json "
+ "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
stmt.executeUpdate(insert_process_portal_log_tmp);
logger.info("Inserted into process_portal_log_tmp");
stmt.close();
}
public void portalStats() throws SQLException {
public void portalLogs() throws SQLException {
Connection con = ConnectDB.getHiveConnection();
Statement stmt = con.createStatement();
con.setAutoCommit(false);
// Original queries were of the style
//
// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent
// FROM usagestats_20200907.process_portal_log_tmp2,
// openaire_prod_stats_20200821.result_oids roid
// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null
//
// The following query is an example of how queries should be written
//
//
// INSERT INTO usagestats_20200907.piwiklogtmp
// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
// FROM usagestats_20200907.process_portal_log_tmp
// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id
// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL);
//
// We should consider whether the queries should instead be written as follows
//
// INSERT INTO usagestats_20200907.piwiklogtmp
// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
// FROM usagestats_20200907.process_portal_log_tmp
// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id
// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND
// roid.oid != '');
logger.info("PortalStats - Step 1");
String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent "
+
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
"WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
"IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".result_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql);
stmt.close();
logger.info("PortalStats - Step 2");
stmt = con.createStatement();
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent "
+
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
"WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
"IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".datasource_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql);
stmt.close();
@ -494,12 +421,11 @@ public class PiwikStatsDB {
*/
logger.info("PortalStats - Step 3");
stmt = con.createStatement();
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent "
+
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
"WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
"IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".project_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql);
stmt.close();
@ -512,233 +438,233 @@ public class PiwikStatsDB {
logger.info("Cleaning oai - Step 1");
stmt = ConnectDB.getHiveConnection().createStatement();
String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," +
"'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/',"
+ "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 2");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," +
"'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/',"
+ "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 3");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," +
"'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/',"
+ "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 4");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," +
"'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/',"
+ "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 5");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," +
"'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/',"
+ "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 6");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," +
"'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/',"
+ "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 7");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," +
"'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/',"
+ "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 8");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," +
"'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/',"
+ "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 9");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," +
"'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/',"
+ "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 10");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," +
"'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/',"
+ "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 11");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," +
"'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/',"
+ "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 12");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," +
"'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/',"
+ "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 13");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," +
"'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/',"
+ "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 14");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," +
"'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/',"
+ "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 15");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," +
"'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/',"
+ "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 16");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," +
"'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/',"
+ "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 17");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," +
"'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/',"
+ "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 18");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," +
"'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/',"
+ "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 19");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," +
"'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/',"
+ "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 20");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," +
"'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/',"
+ "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 21");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," +
"'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/',"
+ "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 22");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," +
"'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/',"
+ "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 23");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," +
"'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/',"
+ "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 24");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," +
"'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/',"
+ "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 25");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," +
"'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/',"
+ "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 26");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," +
"'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/',"
+ "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 27");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," +
"'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/',"
+ "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 28");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," +
"'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/',"
+ "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
logger.info("Cleaning oai - Step 29");
stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," +
"'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'";
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/',"
+ "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'";
stmt.executeUpdate(sql);
stmt.close();
@ -746,52 +672,13 @@ public class PiwikStatsDB {
ConnectDB.getHiveConnection().close();
}
/**
 * Turns a portal URL into an {@code "<entity type>|<identifier>"} key.
 *
 * <p>The URL must contain {@code explore.openaire.eu} at an index greater than
 * zero. It is then URL-decoded (best effort) and searched, in a fixed order,
 * for one of the known query keys. The first key that is followed by at least
 * 46 characters wins; those 46 characters become the identifier.
 *
 * @param url the raw URL taken from the portal log
 * @return the key, or an empty string when the URL is not a portal URL or no
 *         known query key with a long enough value is present
 */
private String processPortalURL(String url) {
    // Not a portal URL (host missing, or found only at index 0): nothing to extract.
    if (url.indexOf("explore.openaire.eu") <= 0) {
        return "";
    }

    String decoded = url;
    try {
        decoded = URLDecoder.decode(url, "UTF-8");
    } catch (Exception e) {
        // Decoding failed: keep going with the raw URL.
        logger.info("Error when decoding the following URL: " + url);
    }

    String id = portalEntityId(decoded, "datasourceId=");
    if (id != null) {
        return "datasource|" + id;
    }
    id = portalEntityId(decoded, "datasource=");
    if (id != null) {
        return "datasource|" + id;
    }
    id = portalEntityId(decoded, "datasourceFilter=");
    if (id != null) {
        return "datasource|" + id;
    }
    id = portalEntityId(decoded, "articleId=");
    if (id != null) {
        return "result|" + id;
    }
    id = portalEntityId(decoded, "datasetId=");
    if (id != null) {
        return "result|" + id;
    }
    // URLs mentioning "oai:dnet:corda" are never reported as projects; they
    // still fall through to the organization check below.
    if (!decoded.contains("oai:dnet:corda")) {
        id = portalEntityId(decoded, "projectId=");
        if (id != null) {
            return "project|" + id;
        }
    }
    id = portalEntityId(decoded, "organizationId=");
    if (id != null) {
        return "organization|" + id;
    }
    return "";
}

/**
 * Returns the 46 characters that follow the first occurrence of {@code key}
 * in {@code url}, or {@code null} when the key is absent, sits at index 0, or
 * is followed by fewer than 46 characters.
 */
private String portalEntityId(String url, String key) {
    final int idLength = 46;
    int keyAt = url.indexOf(key);
    if (keyAt <= 0) {
        return null;
    }
    int from = keyAt + key.length();
    if (url.length() - from < idLength) {
        return null;
    }
    return url.substring(from, from + idLength);
}
private void updateProdTables() throws SQLException {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Inserting data to piwiklog");
String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " +
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
stmt.executeUpdate(sql);
logger.info("Dropping piwiklogtmp");
@ -809,6 +696,65 @@ public class PiwikStatsDB {
}
/**
 * Drops the temporary tables of the usage-stats schema and closes the Hive
 * connection that was used to do so.
 *
 * <p>The tables are dropped one by one, in the order listed below, with a
 * plain {@code DROP TABLE} (no {@code IF EXISTS}). The first statement that
 * fails aborts the method with its {@link SQLException}; the statement and the
 * connection are still closed in that case.
 *
 * @throws SQLException if the connection cannot be obtained or a statement fails
 */
public void finalizeStats() throws SQLException {
    final String[] tmpTables = {
        "piwiklogtmp",
        "process_portal_log_tmp",
        "irus_sushilogtmp",
        "irus_sushilogtmp_json",
        "lareferencialogtmp_json",
        "piwiklogtmp_json",
        "process_portal_log_tmp_json",
        "sarc_sushilogtmp",
        "sarc_sushilogtmp_json_array",
        "sarc_sushilogtmp_json_non_array"
    };

    // One connection for the whole method, so the auto-commit setting, the
    // statement and the final close all act on the same object; both resources
    // are released even when a DROP fails.
    try (Connection con = ConnectDB.getHiveConnection();
        Statement stmt = con.createStatement()) {
        con.setAutoCommit(false);

        for (String table : tmpTables) {
            logger.info("Dropping " + table);
            String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + "." + table;
            stmt.executeUpdate(sql);
            logger.info("Dropped " + table);
        }
    }
}
private ArrayList<String> listHdfsDir(String dir) throws Exception {
FileSystem hdfs = FileSystem.get(new Configuration());
@ -868,4 +814,22 @@ public class PiwikStatsDB {
/**
 * Convenience accessor for the Hive connection.
 *
 * @return whatever {@code ConnectDB.getHiveConnection()} hands back
 * @throws SQLException if {@code ConnectDB} cannot provide the connection
 */
private Connection getConnection() throws SQLException {
    final Connection hiveConnection = ConnectDB.getHiveConnection();
    return hiveConnection;
}
/**
 * Creates the {@code pedocsoldviews} and {@code pedocsolddownloads} tables in
 * the usage-stats schema, if they do not exist yet, as copies of
 * {@code default.pedocsviews} and {@code default.pedocsdownloads}.
 *
 * <p>The Hive connection is left open: only the statement created here is
 * closed, on success and on failure alike.
 *
 * @throws SQLException if the connection cannot be obtained or a statement fails
 */
public void createPedocsOldUsageData() throws SQLException {
    // try-with-resources so the statement is released even if a CREATE fails.
    try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
        ConnectDB.getHiveConnection().setAutoCommit(false);

        logger.info("Creating PeDocs Old Views Table");
        String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
            + ".pedocsoldviews as select * from default.pedocsviews";
        stmt.executeUpdate(sql);
        logger.info("PeDocs Old Views Table created");

        logger.info("Creating PeDocs Old Downloads Table");
        sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
            + ".pedocsolddownloads as select * from default.pedocsdownloads";
        stmt.executeUpdate(sql);
        logger.info("PeDocs Old Downloads Table created");
    }
}
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*;
@ -281,92 +282,11 @@ public class SarcStats {
}
public void finalizeSarcStats() throws Exception {
public void updateSarcLogs() throws Exception {
stmtHive = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
stmtImpala = ConnectDB.getImpalaConnection().createStatement();
/*
logger.info("Creating downloads_stats table_tmp");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats_tmp "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`result_id` string, "
+ "`date` string, "
+ "`count` bigint, "
+ "`openaire` bigint)";
stmtHive.executeUpdate(createDownloadsStats);
logger.info("Created downloads_stats_tmp table");
logger.info("Dropping sarc_sushilogtmp_impala table");
String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".sarc_sushilogtmp_impala";
stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala);
logger.info("Dropped sarc_sushilogtmp_impala table");
logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala");
String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sarc_sushilogtmp_impala "
+ "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
stmtHive.executeUpdate(createSarcSushilogtmpImpala);
logger.info("Created sarc_sushilogtmp_impala");
logger.info("Making sarc_sushilogtmp visible to impala");
String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema()
+ ".sarc_sushilogtmp_impala;";
stmtImpala.executeUpdate(invalidateMetadata);
logger.info("Dropping downloads_stats_impala table");
String drop_downloads_stats_impala = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats_impala";
stmtHive.executeUpdate(drop_downloads_stats_impala);
logger.info("Dropped downloads_stats_impala table");
logger.info("Making downloads_stats_impala deletion visible to impala");
try {
String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats_impala;";
stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala);
} catch (SQLException sqle) {
}
// We run the following query in Impala because it is faster
logger.info("Creating downloads_stats_impala");
String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats_impala AS "
+ "SELECT s.source, d.id AS repository_id, "
+ "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
+ "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, "
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+ ConnectDB.getStatsDBSchema() + ".result_pids ro "
+ "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
+ "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
stmtImpala.executeUpdate(createDownloadsStatsImpala);
logger.info("Creating downloads_stats_impala");
// Insert into downloads_stats
logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp");
String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats_tmp SELECT * "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala";
stmtHive.executeUpdate(insertDStats);
logger.info("Inserted into downloads_stats_tmp");
logger.info("Creating sushilog table");
String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`rid` string, "
+ "`date` string, "
+ "`metric_type` string, "
+ "`count` int)";
stmtHive.executeUpdate(createSushilog);
logger.info("Created sushilog table");
*/
// Insert into sushilog
logger.info("Inserting into sushilog");
String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
@ -389,9 +309,12 @@ public class SarcStats {
logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
// Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1);
// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
// end.add(Calendar.MONTH, +1);
// end.add(Calendar.DAY_OF_MONTH, -1);
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
@ -425,10 +348,10 @@ public class SarcStats {
org.apache.hadoop.fs.LocalFileSystem.class.getName());
FileSystem dfs = FileSystem.get(config);
if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) {
if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) {
logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn);
} else {
start.add(Calendar.MONTH, 1);
while (start.before(end)) {
String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
+ simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
@ -499,11 +422,13 @@ public class SarcStats {
// Check the file size and if it is too big, delete it
File fileArray = new File(filePathArray);
if (fileArray.length() == 0)
if (fileArray.length() == 0) {
fileArray.delete();
}
File fileNonArray = new File(filePathNonArray);
if (fileNonArray.length() == 0)
if (fileNonArray.length() == 0) {
fileNonArray.delete();
}
}

View File

@ -51,8 +51,6 @@ public class UsageStatsExporter {
logger.info("Initialising DB properties");
ConnectDB.init();
// runImpalaQuery();
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
logger.info("Re-creating database and tables");
@ -60,10 +58,6 @@ public class UsageStatsExporter {
piwikstatsdb.recreateDBAndTables();
logger.info("DB-Tables-TmpTables are created ");
}
// else {
// piwikstatsdb.createTmpTables();
// logger.info("TmpTables are created ");
// }
logger.info("Initializing the download logs module");
PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
@ -107,7 +101,6 @@ public class UsageStatsExporter {
logger.info("Downloaded LaReferencia logs");
}
LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
if (ExecuteWorkflow.processLaReferenciaLogs) {
@ -116,7 +109,6 @@ public class UsageStatsExporter {
logger.info("LaReferencia logs done");
}
IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
logger.info("Creating Irus Stats tables");
@ -132,14 +124,11 @@ public class UsageStatsExporter {
irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
}
if (ExecuteWorkflow.irusProcessStats) {
irusstats.processIrusStats();
logger.info("Irus done");
}
SarcStats sarcStats = new SarcStats();
if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
sarcStats.reCreateLogDirs();
@ -148,51 +137,70 @@ public class UsageStatsExporter {
sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
}
if (ExecuteWorkflow.sarcProcessStats) {
sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
sarcStats.finalizeSarcStats();
sarcStats.updateSarcLogs();
}
logger.info("Sarc done");
/*
// finalize usagestats
logger.info("Dropping tmp tables");
if (ExecuteWorkflow.finalizeStats) {
piwikstatsdb.finalizeStats();
logger.info("Finalized stats");
}
*/
/*
// Make the tables available to Impala
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
logger.info("Making tables visible to Impala");
invalidateMetadata();
}
*/
logger.info("End");
logger.info("Dropped tmp tables");
}
private void invalidateMetadata() throws SQLException {
Statement stmt = null;
logger.info("Raw Data Download End");
}
stmt = ConnectDB.getImpalaConnection().createStatement();
public void createdDBWithTablesOnly() throws Exception {
logger.info("Initialising DB properties");
ConnectDB.init();
String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
piwikstatsdb.recreateDBAndTables();
piwikstatsdb.createPedocsOldUsageData();
Statement stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating LaReferencia tables");
String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
+ "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ "stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableLareferenciaLog);
logger.info("Created LaReferencia tables");
logger.info("Creating sushilog");
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog(source STRING, "
+ "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
+ "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableSushiLog);
logger.info("Created sushilog");
logger.info("Updating piwiklog");
String sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ ".piwiklog select * from openaire_prod_usage_raw.piwiklog";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
logger.info("Updating lareferencialog");
sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ ".lareferencialog select * from openaire_prod_usage_raw.lareferencialog";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
logger.info("Updating sushilog");
sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog select * from openaire_prod_usage_raw.sushilog";
stmt.executeUpdate(sql);
stmt.close();
ConnectDB.getHiveConnection().close();
logger.info("Sushi Tables Created");
}
}

View File

@ -125,12 +125,6 @@
"paramDescription": "Starting log period",
"paramRequired": true
},
{
"paramName": "elp",
"paramLongName": "endingLogPeriod",
"paramDescription": "Ending log period",
"paramRequired": true
},
{
"paramName": "npidd",
"paramLongName": "numberOfPiwikIdsToDownload",
@ -216,12 +210,6 @@
"paramDescription": "Create the usage_stats table?",
"paramRequired": true
},
{
"paramName": "ftvi",
"paramLongName": "finalTablesVisibleToImpala",
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
"paramRequired": true
},
{
"paramName": "nodt",
"paramLongName": "numberOfDownloadThreads",

View File

@ -63,7 +63,6 @@
<arg>--downloadPiwikLogs</arg><arg>${downloadPiwikLogs}</arg>
<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
<arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
<arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
<arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
<arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
<arg>--laReferenciaEmptyDirs</arg><arg>${laReferenciaEmptyDirs}</arg>
@ -78,7 +77,6 @@
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
<arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>
<capture-output/>
</java>

View File

@ -23,7 +23,35 @@
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-usage-stats-build</artifactId>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.15</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<!-- more config here as you see fit -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>

View File

@ -0,0 +1 @@
mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagestatsbuild

View File

@ -3,12 +3,17 @@
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;
import org.apache.log4j.Logger;
@ -30,6 +35,7 @@ public abstract class ConnectDB {
private static String dbImpalaUrl;
private static String usageRawDataDBSchema;
private static String usageStatsDBSchema;
private static String usagestatsPermanentDBSchema;
private static String statsDBSchema;
private final static Logger log = Logger.getLogger(ConnectDB.class);
@ -40,6 +46,7 @@ public abstract class ConnectDB {
usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema;
statsDBSchema = ExecuteWorkflow.statsDBSchema;
usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema;
usagestatsPermanentDBSchema = ExecuteWorkflow.usagestatsPermanentDBSchema;
Class.forName("org.apache.hive.jdbc.HiveDriver");
}
@ -65,17 +72,27 @@ public abstract class ConnectDB {
}
public static String getUsageRawDataDBSchema() {
return usageRawDataDBSchema;
return ConnectDB.usageRawDataDBSchema;
}
/**
 * Returns the name of the dated usage-stats schema: the configured base schema name, an underscore, and
 * today's date formatted as {@code yyyyMMdd}. The date is taken from the system clock on every call.
 *
 * @return the base usage-stats schema name suffixed with {@code _yyyyMMdd}
 */
public static String getUsageStatsDBSchema() {
	// "yyyy" is the calendar year. The previous pattern used "YYYY" (week year), which reports the
	// wrong year for dates in the week that straddles New Year.
	DateFormat df = new SimpleDateFormat("yyyyMMdd");
	String todayAsString = df.format(Calendar.getInstance().getTime());
	return ConnectDB.usageStatsDBSchema + "_" + todayAsString;
}
/**
 * Returns the stats schema name held in the static {@code statsDBSchema} field, which the class
 * initialisation code copies from {@code ExecuteWorkflow.statsDBSchema}.
 *
 * @return the configured stats schema name, unmodified
 */
public static String getStatsDBSchema() {
return ConnectDB.statsDBSchema;
}
/**
 * Returns the permanent usage-stats schema name held in the static {@code usagestatsPermanentDBSchema}
 * field, which the class initialisation code copies from {@code ExecuteWorkflow.usagestatsPermanentDBSchema}.
 * Unlike {@code getUsageStatsDBSchema()}, no date suffix is appended.
 *
 * @return the configured permanent usage-stats schema name, unmodified
 */
public static String getUsagestatsPermanentDBSchema() {
return ConnectDB.usagestatsPermanentDBSchema;
}
private static Connection connectHive() throws SQLException {
/*
* Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt =

View File

@ -3,6 +3,7 @@
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.text.SimpleDateFormat;
@ -11,57 +12,42 @@ import java.util.Date;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.BasicConfigurator;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author D. Pierrakos, S. Zoupanos
*/
public class ExecuteWorkflow {
static String matomoAuthToken;
// static String matomoAuthToken;
static String matomoBaseURL;
static String repoLogPath;
static String portalLogPath;
static String portalMatomoID;
static String irusUKBaseURL;
// static String irusUKBaseURL;
static String irusUKReportPath;
static String sarcsReportPathArray;
static String sarcsReportPathNonArray;
static String lareferenciaLogPath;
static String lareferenciaBaseURL;
static String lareferenciaAuthToken;
// static String lareferenciaBaseURL;
// static String lareferenciaAuthToken;
static String dbHiveUrl;
static String dbImpalaUrl;
static String usageRawDataDBSchema;
static String usageStatsDBSchema;
static String usagestatsPermanentDBSchema;
static String statsDBSchema;
static boolean recreateDbAndTables;
static boolean piwikEmptyDirs;
static boolean downloadPiwikLogs;
static boolean processPiwikLogs;
static Calendar startingLogPeriod;
static Calendar endingLogPeriod;
static int numberOfPiwikIdsToDownload;
static int numberOfSiteIdsToDownload;
static boolean laReferenciaEmptyDirs;
static boolean downloadLaReferenciaLogs;
static boolean processLaReferenciaLogs;
static boolean irusCreateTablesEmptyDirs;
static boolean irusDownloadReports;
static boolean irusProcessStats;
static int irusNumberOfOpendoarsToDownload;
static boolean sarcCreateTablesEmptyDirs;
static boolean sarcDownloadReports;
static boolean sarcProcessStats;
static int sarcNumberOfIssnToDownload;
static boolean finalizeStats;
static boolean finalTablesVisibleToImpala;
@ -84,23 +70,24 @@ public class ExecuteWorkflow {
parser.parseArgument(args);
// Setting up the initial parameters
matomoAuthToken = parser.get("matomoAuthToken");
matomoBaseURL = parser.get("matomoBaseURL");
// matomoAuthToken = parser.get("matomoAuthToken");
// matomoBaseURL = parser.get("matomoBaseURL");
repoLogPath = parser.get("repoLogPath");
portalLogPath = parser.get("portalLogPath");
portalMatomoID = parser.get("portalMatomoID");
irusUKBaseURL = parser.get("irusUKBaseURL");
// irusUKBaseURL = parser.get("irusUKBaseURL");
irusUKReportPath = parser.get("irusUKReportPath");
sarcsReportPathArray = parser.get("sarcsReportPathArray");
sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
lareferenciaLogPath = parser.get("lareferenciaLogPath");
lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
// lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
// lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
dbHiveUrl = parser.get("dbHiveUrl");
dbImpalaUrl = parser.get("dbImpalaUrl");
usageRawDataDBSchema = parser.get("usageRawDataDBSchema");
usageStatsDBSchema = parser.get("usageStatsDBSchema");
usagestatsPermanentDBSchema = parser.get("usagestatsPermanentDBSchema");
statsDBSchema = parser.get("statsDBSchema");
if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
@ -109,16 +96,13 @@ public class ExecuteWorkflow {
processPiwikLogs = false;
}
String startingLogPeriodStr = parser.get("startingLogPeriod");
Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
String endingLogPeriodStr = parser.get("endingLogPeriod");
Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
// String startingLogPeriodStr = parser.get("startingLogPeriod");
// Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
// startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
//
// String endingLogPeriodStr = parser.get("endingLogPeriod");
// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
recreateDbAndTables = true;
@ -138,14 +122,11 @@ public class ExecuteWorkflow {
irusProcessStats = false;
}
irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload"));
if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
sarcProcessStats = true;
} else {
sarcProcessStats = false;
}
sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload"));
if (parser.get("finalizeStats").toLowerCase().equals("true")) {
finalizeStats = true;

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.*;
@ -34,12 +35,10 @@ public class IrusStats {
public IrusStats() throws Exception {
}
public void processIrusStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Creating irus_downloads_stats_tmp table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".irus_downloads_stats_tmp "
@ -67,5 +66,4 @@ public class IrusStats {
// ConnectDB.getHiveConnection().close();
}
}

View File

@ -41,8 +41,6 @@ public class LaReferenciaStats {
public LaReferenciaStats() throws Exception {
}
public void processLogs() throws Exception {
try {
logger.info("LaReferencia creating viewsStats");
@ -62,7 +60,6 @@ public class LaReferenciaStats {
}
}
public void viewsStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
@ -101,7 +98,7 @@ public class LaReferenciaStats {
logger.info("Created la_views_stats_tmp table");
stmt.close();
ConnectDB.getHiveConnection().close();
// ConnectDB.getHiveConnection().close();
}
private void downloadsStats() throws Exception {
@ -145,5 +142,4 @@ public class LaReferenciaStats {
// ConnectDB.getHiveConnection().close();
}
}

View File

@ -1,22 +1,15 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.*;
import java.net.URLDecoder;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Timestamp;
/**
* @author D. Pierrakos, S. Zoupanos
@ -29,12 +22,10 @@ public class PiwikStatsDB {
private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
public PiwikStatsDB() throws Exception {
}
public void recreateDBAndTables() throws Exception {
this.createDatabase();
// The piwiklog table is not needed since it is built
@ -43,23 +34,39 @@ public class PiwikStatsDB {
}
private void createDatabase() throws Exception {
// try {
//
// stmt = ConnectDB.getHiveConnection().createStatement();
//
// logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
// String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
// stmt.executeUpdate(dropDatabase);
// } catch (Exception e) {
// logger.error("Failed to drop database: " + e);
// throw new Exception("Failed to drop database: " + e.toString(), e);
// }
//
try {
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema();
stmt.executeUpdate(createDatabase);
logger.info("Usagestats DB created: " + ConnectDB.getUsageStatsDBSchema());
logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
stmt.executeUpdate(dropDatabase);
} catch (Exception e) {
logger.error("Failed to drop database: " + e);
throw new Exception("Failed to drop database: " + e.toString(), e);
logger.error("Failed to create database: " + e);
throw new Exception("Failed to create database: " + e.toString(), e);
}
try {
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema();
stmt.executeUpdate(createDatabase);
logger.info("Creating permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema());
String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS "
+ ConnectDB.getUsagestatsPermanentDBSchema();
stmt.executeUpdate(createPermanentDatabase);
logger.info("Created permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema());
} catch (Exception e) {
logger.error("Failed to create database: " + e);
@ -67,7 +74,6 @@ public class PiwikStatsDB {
}
}
public void processLogs() throws Exception {
try {
@ -85,63 +91,63 @@ public class PiwikStatsDB {
}
}
public void viewsStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping openaire_result_views_monthly_tmp view");
String drop_result_views_monthly = "DROP VIEW IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".openaire_piwikresult_views_monthly_tmp";
String drop_result_views_monthly = "DROP VIEW IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".openaire_piwikresult_views_monthly_tmp";
stmt.executeUpdate(drop_result_views_monthly);
logger.info("Dropped openaire_result_views_monthly_tmp view");
logger.info("Creating openaire_result_views_monthly_tmp view");
String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_result_views_monthly_tmp " +
"AS SELECT entity_id AS id, " +
"COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " +
"AS openaire_referrer, " +
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
"FROM " + ConnectDB.getUsageRawDataDBSchema()
+ ".piwiklog where action='action' and (source_item_type='oaItem' or " +
"source_item_type='repItem') " +
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
"source ORDER BY source, entity_id";
+ ".openaire_result_views_monthly_tmp "
+ "AS SELECT entity_id, "
+ "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) "
+ "AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema()
+ ".piwiklog where action='action' and (source_item_type='oaItem' or "
+ "source_item_type='repItem') "
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
+ "source ORDER BY source, entity_id";
stmt.executeUpdate(create_result_views_monthly);
logger.info("Created openaire_result_views_monthly_tmp table");
logger.info("Dropping openaire_views_stats_tmp table");
String drop_views_stats = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".openaire_views_stats_tmp";
String drop_views_stats = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp";
stmt.executeUpdate(drop_views_stats);
logger.info("Dropped openaire_views_stats_tmp table");
logger.info("Creating openaire_views_stats_tmp table");
String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp " +
"AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " +
"max(views) AS count, max(openaire_referrer) AS openaire " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " +
ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
"WHERE p.source=d.piwik_id AND p.id=ro.oid " +
"GROUP BY d.id, ro.id, month " +
"ORDER BY d.id, ro.id, month ";
+ ".openaire_views_stats_tmp "
+ "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' "
+ "GROUP BY d.id, ro.id, month "
+ "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(create_views_stats);
logger.info("Created openaire_views_stats_tmp table");
logger.info("Creating openaire_pageviews_stats_tmp table");
String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_pageviews_stats_tmp AS SELECT " +
"'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " +
ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
"WHERE p.source=" + ExecuteWorkflow.portalMatomoID + " AND p.source=d.piwik_id and p.id=ro.id \n" +
"GROUP BY d.id, ro.id, month " +
"ORDER BY d.id, ro.id, month ";
+ ".openaire_pageviews_stats_tmp AS SELECT "
+ "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=" + ExecuteWorkflow.portalMatomoID
+ " AND p.source=d.piwik_id and p.id=ro.id AND ro.oid!='200' "
+ "GROUP BY d.id, ro.id, month "
+ "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(create_pageviews_stats);
logger.info("Created pageviews_stats table");
@ -154,44 +160,46 @@ public class PiwikStatsDB {
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping openaire_result_downloads_monthly_tmp view");
String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".openaire_result_downloads_monthly_tmp";
String drop_result_downloads_monthly = "DROP VIEW IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".openaire_result_downloads_monthly_tmp";
stmt.executeUpdate(drop_result_downloads_monthly);
logger.info("Dropped openaire_result_downloads_monthly_tmp view");
logger.info("Creating openaire_result_downloads_monthly_tmp view");
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp " +
"AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " +
"SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " +
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
"FROM " + ConnectDB.getUsageRawDataDBSchema()+ ".piwiklog where action='download' " +
"AND (source_item_type='oaItem' OR source_item_type='repItem') " +
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " +
"ORDER BY source, entity_id, month";
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_result_downloads_monthly_tmp "
+ "AS SELECT entity_id, "
+ "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
+ "COUNT(entity_id) as downloads, "
+ "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog where action='download' "
+ "AND (source_item_type='oaItem' OR source_item_type='repItem') "
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source "
+ "ORDER BY source, entity_id, month";
stmt.executeUpdate(sql);
logger.info("Created openaire_result_downloads_monthly_tmp view");
logger.info("Dropping openaire_downloads_stats_tmp table");
String drop_views_stats = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".openaire_downloads_stats_tmp";
String drop_views_stats = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".openaire_downloads_stats_tmp";
stmt.executeUpdate(drop_views_stats);
logger.info("Dropped openaire_downloads_stats_tmp table");
logger.info("Creating openaire_downloads_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " +
"SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " +
"max(downloads) AS count, max(openaire_referrer) AS openaire " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " +
ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
"WHERE p.source=d.piwik_id and p.id=ro.oid " +
"GROUP BY d.id, ro.id, month " +
"ORDER BY d.id, ro.id, month ";
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS "
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ "max(downloads) AS count, max(openaire_referrer) AS openaire "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' "
+ "GROUP BY d.id, ro.id, month "
+ "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(sql);
logger.info("Created downloads_stats table");
logger.info("Dropping openaire_result_downloads_monthly_tmp view");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp";
logger.info("Dropped openaire_result_downloads_monthly_tmp view ");
@ -201,6 +209,134 @@ public class PiwikStatsDB {
// ConnectDB.getHiveConnection().close();
}
/**
 * Rebuilds the temporary Pedocs tables ({@code pedocs_views_stats_tmp} and
 * {@code pedocs_downloads_stats_tmp}) in the usage-stats schema from the raw
 * {@code pedocsoldviews} / {@code pedocsolddownloads} tables, joined to {@code result_oids} of the
 * stats schema on {@code oid = identifier}.
 *
 * @throws Exception if obtaining the Hive connection or executing any statement fails
 */
public void uploadOldPedocs() throws Exception {
	stmt = ConnectDB.getHiveConnection().createStatement();
	try {
		ConnectDB.getHiveConnection().setAutoCommit(false);

		// Resolve the schema names once: the usage-stats schema name is derived from the current date,
		// so every statement of this run must target the same value.
		final String usageStatsSchema = ConnectDB.getUsageStatsDBSchema();
		final String rawDataSchema = ConnectDB.getUsageRawDataDBSchema();
		final String statsSchema = ConnectDB.getStatsDBSchema();

		// Dropping Pedocs pedocs_views_stats_tmp table
		logger.info("Dropping Pedocs pedocs_views_stats_tmp table");
		String sql = "DROP TABLE IF EXISTS " + usageStatsSchema + ".pedocs_views_stats_tmp";
		stmt.executeUpdate(sql);
		// Report completion only after the statement has actually run.
		logger.info("Dropped pedocs_views_stats_tmp table ");

		// Dropping Pedocs pedocs_downloads_stats table
		logger.info("Dropping pedocs_downloads_stats table");
		sql = "DROP TABLE IF EXISTS " + usageStatsSchema + ".pedocs_downloads_stats";
		stmt.executeUpdate(sql);
		logger.info("Dropped pedocs_downloads_stats table ");

		// Dropping Pedocs pedocs_downloads_stats_tmp table: it is re-created below with
		// CREATE TABLE IF NOT EXISTS, so without this drop a leftover table from an earlier run
		// would be kept as-is instead of being rebuilt.
		logger.info("Dropping Pedocs pedocs_downloads_stats_tmp table");
		sql = "DROP TABLE IF EXISTS " + usageStatsSchema + ".pedocs_downloads_stats_tmp";
		stmt.executeUpdate(sql);
		logger.info("Dropped pedocs_downloads_stats_tmp table ");

		// Creating Pedocs pedocs_views_stats_tmp table
		logger.info("Creating Pedocs pedocs_views_stats_tmp table");
		sql = "CREATE TABLE IF NOT EXISTS " + usageStatsSchema + ".pedocs_views_stats_tmp AS "
			+ "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id,"
			+ "r.id as result_id,date,counter_abstract as count, 0 as openaire "
			+ "FROM " + rawDataSchema + ".pedocsoldviews p, " + statsSchema
			+ ".result_oids r where r.oid=p.identifier";
		stmt.executeUpdate(sql);
		logger.info("Created pedocs_views_stats_tmp table ");

		// Creating Pedocs pedocs_downloads_stats_tmp table
		logger.info("Creating Pedocs pedocs_downloads_stats_tmp table");
		sql = "CREATE TABLE IF NOT EXISTS " + usageStatsSchema + ".pedocs_downloads_stats_tmp AS "
			+ "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id,"
			+ "r.id as result_id, date, counter as count, 0 as openaire "
			+ "FROM " + rawDataSchema + ".pedocsolddownloads p, " + statsSchema
			+ ".result_oids r where r.oid=p.identifier";
		stmt.executeUpdate(sql);
		logger.info("Created pedocs_downloads_stats_tmp table ");
	} finally {
		// Release the statement even when one of the updates fails.
		stmt.close();
	}
}
public void uploadTUDELFTStats() throws Exception {
stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
// Dropping TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view");
String sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp";
logger.info("Dropped tudelft_result_views_monthly_tmp view ");
stmt.executeUpdate(sql);
// Dropping TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view");
sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp";
logger.info("Dropped tudelft_result_downloads_monthly_tmp view ");
stmt.executeUpdate(sql);
// Dropping TUDELFT tudelft_views_stats_tmp table
logger.info("Dropping TUDELFT tudelft_views_stats_tmp table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp";
logger.info("Dropped tudelft_views_stats_tmp table ");
stmt.executeUpdate(sql);
// Dropping TUDELFT tudelft_downloads_stats_tmp table
logger.info("Dropping TUDELFT tudelft_downloads_stats_tmp table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp";
logger.info("Dropped tudelft_downloads_stats_tmp table ");
stmt.executeUpdate(sql);
// Creating TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Creating TUDELFT tudelft_result_views_monthly_tmp view");
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp "
+ "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog "
+ "WHERE action='action' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 "
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id";
stmt.executeUpdate(sql);
logger.info("Created tudelft_result_views_monthly_tmp view ");
// Creating TUDELFT tudelft_views_stats_tmp table
logger.info("Creating TUDELFT tudelft_views_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp AS "
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".tudelft_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' "
+ "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id";
stmt.executeUpdate(sql);
logger.info("Created TUDELFT tudelft_views_stats_tmp table");
// Creating TUDELFT tudelft_result_downloads_monthly_tmp view
logger.info("Creating TUDELFT tudelft_result_downloads_monthly_tmp view");
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp "
+ "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog "
+ "WHERE action='download' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 "
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id";
stmt.executeUpdate(sql);
logger.info("Created tudelft_result_downloads_monthly_tmp view ");
// Creating TUDELFT tudelft_downloads_stats_tmp table
logger.info("Creating TUDELFT tudelft_downloads_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp AS "
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".tudelft_result_downloads_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' "
+ "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id";
stmt.executeUpdate(sql);
logger.info("Created TUDELFT tudelft_downloads_stats_tmp table");
// Dropping TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view");
sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp";
logger.info("Dropped tudelft_result_views_monthly_tmp view ");
stmt.executeUpdate(sql);
// Dropping TUDELFT tudelft_result_downloads_monthly_tmp view
logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view");
sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp";
logger.info("Dropped tudelft_result_downloads_monthly_tmp view ");
stmt.executeUpdate(sql);
}
public void finalizeStats() throws Exception {
stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
@ -223,83 +359,116 @@ public class PiwikStatsDB {
logger.info("Dropped pageviews_stats table ");
stmt.executeUpdate(sql);
// Dropping usage_stats table
logger.info("Dropping usage_stats table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
logger.info("Dropped usage_stats table ");
stmt.executeUpdate(sql);
// Creating views_stats table
logger.info("Creating views_stats table");
String createViewsStats = "CREATE TABLE IF NOT EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".views_stats " +
"LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET";
String createViewsStats = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".views_stats "
+ "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(createViewsStats);
logger.info("Created views_stats table");
// Inserting OpenAIRE views stats
logger.info("Inserting Openaire data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp";
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Openaire views updated to views_stats");
// Inserting Pedocs old views stats
logger.info("Inserting Pedocs old data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Pedocs views updated to views_stats");
// Inserting TUDELFT views stats
logger.info("Inserting TUDELFT data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("TUDELFT views updated to views_stats");
// Inserting Lareferencia views stats
logger.info("Inserting LaReferencia data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp";
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("LaReferencia views updated to views_stats");
logger.info("Creating downloads_stats table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".downloads_stats " +
"LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET";
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats "
+ "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(createDownloadsStats);
logger.info("Created downloads_stats table");
// Inserting OpenAIRE downloads stats
logger.info("Inserting OpenAIRE data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp";
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserted OpenAIRE data to downloads_stats");
// Inserting Pedocs old downloads stats
logger.info("Inserting PeDocs old data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserted Pedocs data to downloads_stats");
// Inserting TUDELFT downloads stats
logger.info("Inserting TUDELFT old data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserted TUDELFT data to downloads_stats");
// Inserting Lareferencia downloads stats
logger.info("Inserting LaReferencia data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Lareferencia downloads updated to downloads_stats");
// Inserting IRUS downloads stats
logger.info("Inserting IRUS data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp";
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("IRUS downloads updated to downloads_stats");
// Inserting SARC-OJS downloads stats
logger.info("Inserting SARC data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp";
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("SARC-OJS downloads updated to downloads_stats");
logger.info("Creating pageviews_stats table");
String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".pageviews_stats " +
"LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET";
+ ".pageviews_stats "
+ "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(create_pageviews_stats);
logger.info("Created pageviews_stats table");
// Inserting OpenAIRE views stats from Portal
logger.info("Inserting data to page_views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " +
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp";
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Dropping full_dates table");
String dropFullDates = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".full_dates";
String dropFullDates = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".full_dates";
stmt.executeUpdate(dropFullDates);
logger.info("Dropped full_dates table");
@ -310,35 +479,80 @@ public class PiwikStatsDB {
int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
logger.info("Creating full_dates table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " +
"SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " +
"FROM (SELECT DATE '2016-01-01' AS from_date) p " +
"LATERAL VIEW " +
"posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x";
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS "
+ "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date "
+ "FROM (SELECT DATE '2016-01-01' AS from_date) p "
+ "LATERAL VIEW "
+ "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x";
stmt.executeUpdate(sql);
logger.info("Created full_dates table");
logger.info("Inserting data to usage_stats");
sql = "CREATE TABLE IF NOT EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " +
"SELECT coalesce(ds.source, vs.source) as source, " +
"coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
"coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
"coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
"coalesce(ds.openaire, 0) as openaire_downloads, " +
"coalesce(vs.openaire, 0) as openaire_views " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " +
ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " +
"AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS "
+ "SELECT coalesce(ds.source, vs.source) as source, "
+ "coalesce(ds.repository_id, vs.repository_id) as repository_id, "
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, "
+ "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, "
+ "coalesce(ds.openaire, 0) as openaire_downloads, "
+ "coalesce(vs.openaire, 0) as openaire_views "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN "
+ ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source "
+ "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
stmt.executeUpdate(sql);
logger.info("Inserted data to usage_stats");
logger.info("Building views at permanent DB starts at: " + new Timestamp(System.currentTimeMillis()));
logger.info("Dropping view views_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view views_stats on permanent usagestats DB");
logger.info("Create view views_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
stmt.executeUpdate(sql);
logger.info("Created view views_stats on permanent usagestats DB");
logger.info("Dropping view pageviews_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view pageviews_stats on permanent usagestats DB");
logger.info("Create view pageviews_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
logger.info("Created view pageviews_stats on permanent usagestats DB");
logger.info("Dropping view downloads_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view on downloads_stats on permanent usagestats DB");
logger.info("Create view on downloads_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
logger.info("Created view on downloads_stats on permanent usagestats DB");
logger.info("Dropping view usage_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view on usage_stats on permanent usagestats DB");
logger.info("Create view on usage_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
logger.info("Created view on usage_stats on permanent usagestats DB");
logger.info("Building views at permanent DB ends at: " + new Timestamp(System.currentTimeMillis()));
stmt.close();
ConnectDB.getHiveConnection().close();
}
/**
 * Obtains the Hive connection by delegating to {@code ConnectDB.getHiveConnection()}.
 *
 * @return the connection handed back by {@code ConnectDB}
 * @throws SQLException if {@code ConnectDB} fails to provide the connection
 */
private Connection getConnection() throws SQLException {
final Connection hiveConnection = ConnectDB.getHiveConnection();
return hiveConnection;
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.*;

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.IOException;
@ -41,8 +42,16 @@ public class UsageStatsExporter {
// logger.info("TmpTables are created ");
// }
if (ExecuteWorkflow.processPiwikLogs) {
logger.info("Processing logs");
logger.info("Processing Piwik logs");
piwikstatsdb.processLogs();
logger.info("Piwik logs Done");
logger.info("Processing Pedocs Old Stats");
piwikstatsdb.uploadOldPedocs();
logger.info("Processing Pedocs Old Stats Done");
logger.info("Processing TUDELFT Stats");
piwikstatsdb.uploadTUDELFTStats();
logger.info("Processing TUDELFT Stats Done");
}
LaReferenciaStats lastats = new LaReferenciaStats();
@ -100,6 +109,18 @@ public class UsageStatsExporter {
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
stmt.close();
ConnectDB.getHiveConnection().close();
}

View File

@ -1,16 +1,4 @@
[
{
"paramName": "mat",
"paramLongName": "matomoAuthToken",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "mbu",
"paramLongName": "matomoBaseURL",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "rlp",
"paramLongName": "repoLogPath",
@ -29,12 +17,6 @@
"paramDescription": "nameNode of the target cluster",
"paramRequired": true
},
{
"paramName": "iukbuw",
"paramLongName": "irusUKBaseURL",
"paramDescription": "working directory",
"paramRequired": true
},
{
"paramName": "iukrp",
"paramLongName": "irusUKReportPath",
@ -59,18 +41,6 @@
"paramDescription": "activate transform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "lbu",
"paramLongName": "lareferenciaBaseURL",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "lat",
"paramLongName": "lareferenciaAuthToken",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "dbhu",
"paramLongName": "dbHiveUrl",
@ -102,15 +72,15 @@
"paramRequired": true
},
{
"paramName": "rdbt",
"paramLongName": "recreateDbAndTables",
"paramDescription": "Re-create database and initial tables?",
"paramName": "uspdbs",
"paramLongName": "usagestatsPermanentDBSchema",
"paramDescription": "Name of the permanent usage stats DB schema in which the final views are created",
"paramRequired": true
},
{
"paramName": "pwed",
"paramLongName": "piwikEmptyDirs",
"paramDescription": "Empty piwik directories?",
"paramName": "rdbt",
"paramLongName": "recreateDbAndTables",
"paramDescription": "Re-create database and initial tables?",
"paramRequired": true
},
{
@ -119,103 +89,24 @@
"paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
"paramRequired": true
},
{
"paramName": "dpwl",
"paramLongName": "downloadPiwikLogs",
"paramDescription": "download piwik logs?",
"paramRequired": true
},
{
"paramName": "slp",
"paramLongName": "startingLogPeriod",
"paramDescription": "Starting log period",
"paramRequired": true
},
{
"paramName": "elp",
"paramLongName": "endingLogPeriod",
"paramDescription": "Ending log period",
"paramRequired": true
},
{
"paramName": "npidd",
"paramLongName": "numberOfPiwikIdsToDownload",
"paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload",
"paramRequired": true
},
{
"paramName": "nsidd",
"paramLongName": "numberOfSiteIdsToDownload",
"paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload",
"paramRequired": true
},
{
"paramName": "lerd",
"paramLongName": "laReferenciaEmptyDirs",
"paramDescription": "Empty LaReferencia directories?",
"paramRequired": true
},
{
"paramName": "plrl",
"paramLongName": "processLaReferenciaLogs",
"paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data",
"paramRequired": true
},
{
"paramName": "dlrl",
"paramLongName": "downloadLaReferenciaLogs",
"paramDescription": "download La Referencia logs?",
"paramRequired": true
},
{
"paramName": "icted",
"paramLongName": "irusCreateTablesEmptyDirs",
"paramDescription": "Irus section: Create tables and empty JSON directories?",
"paramRequired": true
},
{
"paramName": "idr",
"paramLongName": "irusDownloadReports",
"paramDescription": "Irus section: Download reports?",
"paramRequired": true
},
{
"paramName": "ipr",
"paramLongName": "irusProcessStats",
"paramDescription": "Irus section: Process stats?",
"paramRequired": true
},
{
"paramName": "inod",
"paramLongName": "irusNumberOfOpendoarsToDownload",
"paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload",
"paramRequired": true
},
{
"paramName": "icted",
"paramLongName": "sarcCreateTablesEmptyDirs",
"paramDescription": "Sarc section: Create tables and empty JSON directories?",
"paramRequired": true
},
{
"paramName": "idr",
"paramLongName": "sarcDownloadReports",
"paramDescription": "Sarc section: Download reports?",
"paramRequired": true
},
{
"paramName": "ipr",
"paramLongName": "sarcProcessStats",
"paramDescription": "Sarc section: Process stats?",
"paramRequired": true
},
{
"paramName": "inod",
"paramLongName": "sarcNumberOfIssnToDownload",
"paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
"paramRequired": true
},
{
"paramName": "fs",
"paramLongName": "finalizeStats",

View File

@ -42,42 +42,24 @@
<action name='Step1'>
<java>
<main-class>eu.dnetlib.oa.graph.usagestatsbuild.export.ExecuteWorkflow</main-class>
<arg>--matomoAuthToken</arg><arg>${matomoAuthToken}</arg>
<arg>--matomoBaseURL</arg><arg>${matomoBaseURL}</arg>
<arg>--repoLogPath</arg><arg>${repoLogPath}</arg>
<arg>--portalLogPath</arg><arg>${portalLogPath}</arg>
<arg>--portalMatomoID</arg><arg>${portalMatomoID}</arg>
<arg>--irusUKBaseURL</arg><arg>${irusUKBaseURL}</arg>
<arg>--irusUKReportPath</arg><arg>${irusUKReportPath}</arg>
<arg>--sarcsReportPathArray</arg><arg>${sarcsReportPathArray}</arg>
<arg>--sarcsReportPathNonArray</arg><arg>${sarcsReportPathNonArray}</arg>
<arg>--lareferenciaLogPath</arg><arg>${lareferenciaLogPath}</arg>
<arg>--lareferenciaBaseURL</arg><arg>${lareferenciaBaseURL}</arg>
<arg>--lareferenciaAuthToken</arg><arg>${lareferenciaAuthToken}</arg>
<arg>--dbHiveUrl</arg><arg>${hiveJdbcUrl}</arg>
<arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
<arg>--usageRawDataDBSchema</arg><arg>${usageRawDataDBSchema}</arg>
<arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
<arg>--usagestatsPermanentDBSchema</arg><arg>${usagestatsPermanentDBSchema}</arg>
<arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
<arg>--recreateDbAndTables</arg><arg>${recreateDbAndTables}</arg>
<arg>--piwikEmptyDirs</arg><arg>${piwikEmptyDirs}</arg>
<arg>--downloadPiwikLogs</arg><arg>${downloadPiwikLogs}</arg>
<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
<arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
<arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
<arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
<arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
<arg>--laReferenciaEmptyDirs</arg><arg>${laReferenciaEmptyDirs}</arg>
<arg>--downloadLaReferenciaLogs</arg><arg>${downloadLaReferenciaLogs}</arg>
<arg>--processLaReferenciaLogs</arg><arg>${processLaReferenciaLogs}</arg>
<arg>--irusCreateTablesEmptyDirs</arg><arg>${irusCreateTablesEmptyDirs}</arg>
<arg>--irusDownloadReports</arg><arg>${irusDownloadReports}</arg>
<arg>--irusProcessStats</arg><arg>${irusProcessStats}</arg>
<arg>--irusNumberOfOpendoarsToDownload</arg><arg>${irusNumberOfOpendoarsToDownload}</arg>
<arg>--sarcCreateTablesEmptyDirs</arg><arg>${sarcCreateTablesEmptyDirs}</arg>
<arg>--sarcDownloadReports</arg><arg>${sarcDownloadReports}</arg>
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
<arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>