Commit 12122020

Dimitris 2020-12-12 12:00:14 +02:00
parent bbcf6b7c8b
commit dc9c2f3272
33 changed files with 3306 additions and 3022 deletions

View File

@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<project-shared-configuration>
<!--
This file contains additional configuration written by modules in the NetBeans IDE.
The configuration is intended to be shared among all the users of project and
therefore it is assumed to be part of version control checkout.
Without this configuration present, some functionality in the IDE may be limited or fail altogether.
-->
<properties xmlns="http://www.netbeans.org/ns/maven-properties-data/1">
<!--
Properties that influence various parts of the IDE, especially code formatting and the like.
You can copy and paste the single properties, into the pom.xml file and the IDE will pick them up.
That way multiple projects can share the same settings (useful for formatting rules for example).
Any value defined here will override the pom.xml file value but is only applicable to the current project.
-->
<netbeans.hint.jdkPlatform>JDK_1.8</netbeans.hint.jdkPlatform>
</properties>
</project-shared-configuration>

View File

@ -23,7 +23,35 @@
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-usage-datasets-stats-update</artifactId>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.15</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<!-- more config here as you see fit -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
@ -68,6 +96,11 @@
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.mchange</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.5.2</version>
</dependency>
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>

View File

@ -0,0 +1 @@
mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats

View File

@ -32,8 +32,8 @@ public abstract class ConnectDB {
private static String datasetUsageStatsDBSchema;
private static String statsDBSchema;
private final static Logger logger = Logger.getLogger(ConnectDB.class);
private Statement stmt = null;
static void init() throws ClassNotFoundException {
dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
@ -79,6 +79,7 @@ public abstract class ConnectDB {
*/
ComboPooledDataSource cpds = new ComboPooledDataSource();
cpds.setJdbcUrl(dbHiveUrl);
cpds.setUser("dimitris.pierrakos");
cpds.setAcquireIncrement(1);
cpds.setMaxPoolSize(100);
cpds.setMinPoolSize(1);
@ -93,10 +94,10 @@ public abstract class ConnectDB {
cpds.setCheckoutTimeout(0);
cpds.setPreferredTestQuery("SELECT 1");
cpds.setIdleConnectionTestPeriod(60);
logger.info("Opened database successfully");
return cpds.getConnection();
}
@ -107,6 +108,7 @@ public abstract class ConnectDB {
*/
ComboPooledDataSource cpds = new ComboPooledDataSource();
cpds.setJdbcUrl(dbImpalaUrl);
cpds.setUser("dimitris.pierrakos");
cpds.setAcquireIncrement(1);
cpds.setMaxPoolSize(100);
cpds.setMinPoolSize(1);
@ -122,81 +124,8 @@ public abstract class ConnectDB {
cpds.setPreferredTestQuery("SELECT 1");
cpds.setIdleConnectionTestPeriod(60);
logger.info("Opened database successfully");
return cpds.getConnection();
}
private void createDatabase() throws Exception {
try {
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Dropping logs DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
stmt.executeUpdate(dropDatabase);
} catch (Exception e) {
logger.error("Failed to drop database: " + e);
throw new Exception("Failed to drop database: " + e.toString(), e);
}
try {
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
stmt.executeUpdate(createDatabase);
} catch (Exception e) {
logger.error("Failed to create database: " + e);
throw new Exception("Failed to create database: " + e.toString(), e);
}
}
private void createTables() throws Exception {
try {
stmt = ConnectDB.getHiveConnection().createStatement();
// Create Piwiklog table - This table should exist
String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getDataSetUsageStatsDBSchema()
+ ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, action, timestamp, entity_id) "
+ "into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTablePiwikLog);
/////////////////////////////////////////
// Rule for duplicate inserts @ piwiklog
/////////////////////////////////////////
String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getDataSetUsageStatsDBSchema()
+ ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTablePortalLog);
//////////////////////////////////////////////////
// Rule for duplicate inserts @ process_portal_log
//////////////////////////////////////////////////
stmt.close();
ConnectDB.getHiveConnection().close();
} catch (Exception e) {
logger.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
}
}
/*
CREATE TABLE IF NOT EXISTS dataciteReports (reportid STRING,
name STRING,
source STRING,
release STRING,
createdby STRING,
report_end_date STRING,
report_start_date STRING)
CLUSTERED BY (reportid)
into 100 buckets stored as orc tblproperties('transactional'='true');
*/
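
The hunk above routes both the Hive and Impala connections through a c3p0 ComboPooledDataSource and now sets an explicit user. The following minimal sketch is not part of the commit; it only illustrates how a caller might borrow one of these pooled connections, assuming ConnectDB.init(), ConnectDB.getHiveConnection() and ConnectDB.getDataSetUsageStatsDBSchema() behave as shown in this file, and using a SHOW TABLES query purely as an example.

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;

public class ConnectDBUsageSketch {
	public static void main(String[] args) throws Exception {
		// Reads the Hive/Impala URLs and schema names from ExecuteWorkflow, as in ConnectDB.init()
		ConnectDB.init();
		// Borrow a pooled Hive connection and run a throwaway query against the usage-stats schema
		try (Connection conn = ConnectDB.getHiveConnection();
			Statement stmt = conn.createStatement()) {
			ResultSet rs = stmt.executeQuery("SHOW TABLES IN " + ConnectDB.getDataSetUsageStatsDBSchema());
			while (rs.next()) {
				System.out.println(rs.getString(1));
			}
		}
	}
}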

View File

@ -0,0 +1,168 @@
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author D. Pierrakos, S. Zoupanos
*/
public class DatasetsStatsDB {
private String logPath;
private String logRepoPath;
private String logPortalPath;
private Statement stmt = null;
private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class);
private String CounterRobotsURL;
private ArrayList robotsList;
public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception {
this.logRepoPath = logRepoPath;
this.logPortalPath = logPortalPath;
}
public void recreateDBAndTables() throws Exception {
this.createDatabase();
this.createTables();
}
// public void reCreateLogDirs() throws IllegalArgumentException, IOException {
// FileSystem dfs = FileSystem.get(new Configuration());
//
// logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
// dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
//
// logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
// dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
//
// logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
// dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
//
// logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
// dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
// }
public ArrayList getRobotsList() {
return robotsList;
}
public void setRobotsList(ArrayList robotsList) {
this.robotsList = robotsList;
}
public String getCounterRobotsURL() {
return CounterRobotsURL;
}
public void setCounterRobotsURL(String CounterRobotsURL) {
this.CounterRobotsURL = CounterRobotsURL;
}
private void createDatabase() throws Exception {
try {
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
stmt.executeUpdate(dropDatabase);
} catch (Exception e) {
logger.error("Failed to drop database: " + e);
throw new Exception("Failed to drop database: " + e.toString(), e);
}
try {
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
stmt.executeUpdate(createDatabase);
} catch (Exception e) {
logger.error("Failed to create database: " + e);
throw new Exception("Failed to create database: " + e.toString(), e);
}
}
private void createTables() throws Exception {
try {
stmt = ConnectDB.getHiveConnection().createStatement();
// Create Reports table - This table should exist
logger.info("Creating Reports Table");
String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getDataSetUsageStatsDBSchema()
+ ".datacitereports(reportid STRING, \n"
+ " name STRING, \n"
+ " source STRING,\n"
+ " release STRING,\n"
+ " createdby STRING,\n"
+ " report_start_date STRING,\n"
+ " report_end_date STRING)\n"
+ " CLUSTERED BY (reportid)\n"
+ " into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableDataciteReports);
logger.info("Reports Table Created");
// Create Datasets Table
logger.info("Creating DataSets Table");
String sqlCreateTableDataSets = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getDataSetUsageStatsDBSchema()
+ ".datasets(ds_type STRING,\n"
+ " ds_title STRING,\n"
+ " yop STRING,\n"
+ " uri STRING,\n"
+ " platform STRING,\n"
+ " data_type STRING,\n"
+ " publisher STRING,\n"
+ " publisher_id_type STRING,\n"
+ " publisher_id_value STRING,\n"
+ " ds_dates_type STRING,\n"
+ " ds_pub_date STRING,\n"
+ " ds_contributors STRING,\n"
// + " ds_contributor_value array <STRING>,\n"
+ " reportid STRING)\n"
+ " CLUSTERED BY (ds_type)\n"
+ " into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableDataSets);
logger.info("DataSets Table Created");
// Create Datasets Performance Table
logger.info("Creating DataSetsPerformance Table");
String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getDataSetUsageStatsDBSchema()
+ ".datasetsperformance(ds_type STRING,\n"
+ " period_end STRING,\n"
+ " period_from STRING,\n"
+ " access_method STRING,\n"
+ " metric_type STRING,\n"
+ " count INT,\n"
+ " country_counts STRING,\n"
+ " reportid STRING)\n"
+ " CLUSTERED BY (ds_type)\n"
+ " into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableDataSetsPerformance);
logger.info("DataSetsPerformance Table Created");
stmt.close();
ConnectDB.getHiveConnection().close();
} catch (Exception e) {
logger.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
}
private Connection getConnection() throws SQLException {
return ConnectDB.getHiveConnection();
}
}
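
DatasetsStatsDB now owns the schema bootstrap that UsageStatsExporter previously did itself: recreateDBAndTables() drops the schema and recreates the datacitereports, datasets and datasetsperformance tables. The sketch below is illustrative only and not part of the commit; the column names come from the CREATE TABLE statements above, and the query is a hypothetical example of reading the stored report headers.

import java.sql.ResultSet;
import java.sql.Statement;

public class DatasetsStatsDBSketch {
	public static void main(String[] args) throws Exception {
		ConnectDB.init();
		// Recreate the usage-stats schema and its tables, as UsageStatsExporter does
		DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", "");
		datasetsDB.recreateDBAndTables();
		// Read back the report headers stored by ReadReportsListFromDatacite
		Statement stmt = ConnectDB.getHiveConnection().createStatement();
		ResultSet rs = stmt.executeQuery(
			"SELECT reportid, name, report_start_date, report_end_date FROM "
				+ ConnectDB.getDataSetUsageStatsDBSchema() + ".datacitereports");
		while (rs.next()) {
			System.out.println(rs.getString("reportid") + " - " + rs.getString("name"));
		}
		stmt.close();
	}
}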

View File

@ -1,97 +1,102 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
/**
* @author dpie
*/
public class DownloadReportsListFromDatacite {
private String dataciteBaseURL;
private String dataciteReportPath;
private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath)
throws MalformedURLException, Exception {
this.dataciteBaseURL = dataciteBaseURL;
this.dataciteReportPath = dataciteReportPath;
}
public void downloadReportsList() throws ParseException {
StringBuilder responseStrBuilder = new StringBuilder();
Gson gson = new Gson();
try {
BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream());
logger.info("Downloading from " + dataciteBaseURL);
BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
String inputStr;
while ((inputStr = streamReader.readLine()) != null) {
responseStrBuilder.append(inputStr);
}
} catch (IOException e) {
logger.info(e.getMessage());
}
JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class);
JsonArray dataArray = jsonObject.getAsJsonArray("reports");
ArrayList reportsList = new ArrayList();
for (JsonElement element : dataArray) {
reportsList.add(element.getAsJsonObject().get("id").getAsString());
}
Iterator it = reportsList.iterator();
while (it.hasNext()) {
String reportId = it.next().toString();
String url = dataciteBaseURL + reportId;
try {
BufferedInputStream in = new BufferedInputStream(new URL(url).openStream());
BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
String inputStr;
StringBuilder responseStrBuilder2 = new StringBuilder();
while ((inputStr = streamReader.readLine()) != null) {
responseStrBuilder2.append(inputStr);
}
FileSystem fs = FileSystem.get(new Configuration());
FSDataOutputStream fin = fs
.create(
new Path(dataciteReportPath + "/" + reportId + ".json"),
true);
byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes();
fin.write(jsonObjectRawBytes);
fin.writeChar('\n');
fin.close();
fin.close();
} catch (IOException e) {
System.out.println(e);
}
}
}
}
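
For orientation, a minimal driver for the class above might look like the sketch below. It is not part of the commit; the base URL and HDFS path are placeholders, and in the workflow these values come from ExecuteWorkflow.dataciteBaseURL and ExecuteWorkflow.dataciteReportPath.

public class DownloadReportsSketch {
	public static void main(String[] args) throws Exception {
		// Placeholder values; the real ones are parsed by ExecuteWorkflow
		String dataciteBaseURL = "https://api.datacite.org/reports/";
		String dataciteReportPath = "/tmp/datacitereports";
		DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(dataciteBaseURL, dataciteReportPath);
		// Fetches the report list and writes each report as <id>.json under the HDFS path
		drfd.downloadReportsList();
	}
}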

View File

@ -18,14 +18,13 @@ public class ExecuteWorkflow {
static String dataciteBaseURL;
static String dataciteReportPath;
static String dbHiveUrl;
static String dbImpalaUrl;
static String datasetUsageStatsDBSchema;
static String statsDBSchema;
static boolean recreateDbAndTables;
static boolean datasetsEmptyDirs;
static boolean finalTablesVisibleToImpala;
public static void main(String args[]) throws Exception {
@ -58,11 +57,11 @@ public class ExecuteWorkflow {
else
datasetsEmptyDirs = false;
// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
// finalTablesVisibleToImpala = true;
// else
// finalTablesVisibleToImpala = false;
//
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
usagestatsExport.export();
}

View File

@ -0,0 +1,408 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import java.io.*;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.sql.Array;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Iterator;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
/**
* @author dpie
*/
public class ReadReportsListFromDatacite {
private String dataciteReportPath;
private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception {
this.dataciteReportPath = dataciteReportPath;
}
public void readReports() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
File folder = new File(dataciteReportPath);
ArrayList<String> jsonFiles = listHdfsDir(dataciteReportPath);
for (String jsonFile : jsonFiles) {
logger.info("Reading report file " + jsonFile);
this.createTmpReportsTable(jsonFile);
String sqlSelectReportID = "SELECT get_json_object(json, '$.report.id') FROM "
+ ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
stmt.execute(sqlSelectReportID);
ResultSet rstmpReportID = stmt.getResultSet();
String reportID = null;
while (rstmpReportID.next()) {
reportID = rstmpReportID.getString(1);
}
logger.info("Checking report with id " + reportID);
String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema()
+ ".datacitereports where reportid=?";
PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists);
stGetReportID.setString(1, reportID);
ResultSet rsCheckIfReportExist = stGetReportID.executeQuery();
if (rsCheckIfReportExist.next()) {
logger.info("Report found with ID " + reportID);
dropTmpReportsTable();
} else {
String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
+ " .datacitereports "
+ "SELECT\n"
+ " get_json_object(json, '$.report.id') AS reportid,\n"
+ " get_json_object(json, '$.report.report-header.report-name') AS name,\n"
+ " get_json_object(json, '$.report.report-header.report-id') AS source,\n"
+ " get_json_object(json, '$.report.report-header.release') AS release,\n"
+ " get_json_object(json, '$.report.report-header.created-by\') AS createdby,\n"
+ " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n"
+ " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n"
+ "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
stmt.execute(sqlInsertReport);
logger.info("Report added");
logger.info("Adding datasets");
String sqlSelecteDatasetsArray = "SELECT get_json_object(json, '$.report.report-datasets') FROM "
+ ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
stmt.execute(sqlSelecteDatasetsArray);
ResultSet rstmpReportDatasets = stmt.getResultSet();
if (rstmpReportDatasets.next() && rstmpReportDatasets.getString(1).indexOf(',') > 0) {
String[] listDatasets = rstmpReportDatasets.getString(1).split(",");
logger.info("Datasets found " + listDatasets.length);
for (int i = 0; i < listDatasets.length; i++) {
String sqlInsertDataSets = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
+ " .datasets "
+ "SELECT\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].dataset-id[0].value') AS ds_type,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].dataset-title') AS ds_title,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i + "].yop') AS yop,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i + "].uri') AS uri,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i + "].platform') AS platform,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i + "].data-type') AS data_type,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i + "].publisher') AS publisher,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].publisher-id.type[0]') AS publisher_id_type,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].publisher-id.value[0]') AS publisher_id_value,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].dataset-dates.type[0]') AS ds_dates_type,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].dataset-dates.value[0]') AS ds_dates_value,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].dataset-contributors') AS ds_contributors,\n"
+ " get_json_object(json, '$.report.id') AS reportid \n"
+ "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
stmt.execute(sqlInsertDataSets);
logger.info("Dataset added " + i);
logger.info("Adding Dataset Performance");
String sqlSelecteDatasetsPerformance = "SELECT get_json_object(json, '$.report.report-datasets["
+ i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
stmt.execute(sqlSelecteDatasetsPerformance);
ResultSet rstmpReportDatasetsPerformance = stmt.getResultSet();
if (rstmpReportDatasetsPerformance.next()
&& rstmpReportDatasetsPerformance.getString(1).indexOf(',') > 0) {
String[] listDatasetsPerformance = rstmpReportDatasetsPerformance.getString(1).split(",");
logger.info("Datasets Performance found " + listDatasetsPerformance.length);
for (int j = 0; j < listDatasetsPerformance.length; j++) {
String sqlSelecteDatasetsPerformanceInstance = "SELECT get_json_object(json, '$.report.report-datasets["
+ i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema()
+ ".tmpjson";
stmt.execute(sqlSelecteDatasetsPerformanceInstance);
ResultSet rstmpReportDatasetsPerformanceInstance = stmt.getResultSet();
if (rstmpReportDatasetsPerformanceInstance.next()
&& rstmpReportDatasetsPerformanceInstance.getString(1).indexOf(',') > 0) {
String[] listDatasetsPerformanceInstance = rstmpReportDatasetsPerformanceInstance
.getString(1)
.split(",");
logger.info("Datasets Performance found " + listDatasetsPerformanceInstance.length);
for (int k = 0; k < listDatasetsPerformanceInstance.length; k++) {
String sqlInsertDataSetsPerformance = "INSERT INTO "
+ ConnectDB.getDataSetUsageStatsDBSchema() + " .datasetsperformance "
+ "SELECT\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].dataset-id[0].value') AS ds_type,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].performance[" + j + "].period.end-date') AS period_end,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].performance[" + j + "].period.begin-date') AS period_from,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].performance[" + j + "].instance[" + k
+ "].access-method') AS access_method,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].performance[" + j + "].instance[" + k
+ "].metric-type') AS metric_type,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].performance[" + j + "].instance[" + k + "].count') AS count,\n"
+ " get_json_object(json, '$.report.report-datasets[" + i
+ "].performance[" + j + "].instance[" + k
+ "].country-counts') AS country_counts,\n"
+ " get_json_object(json, '$.report.id') AS reportid \n"
+ "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
stmt.execute(sqlInsertDataSetsPerformance);
}
}
}
}
logger.info("DatasetPerformance added for dataset" + i);
}
}
logger.info("Adding gzip performance");
String sqlSelecteReportSubsets = "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM "
+ ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
stmt.execute(sqlSelecteReportSubsets);
ResultSet rstmpReportSubsets = stmt.getResultSet();
if (rstmpReportSubsets.next()) {
String unCompressedReport = uncompressString(rstmpReportSubsets.getString(1));
this.readCompressedReport(unCompressedReport, reportID);
}
}
}
this.dropTmpReportsTable();
}
public void readCompressedReport(String report, String reportId) throws Exception {
Gson gson = new Gson();
JsonObject jsonObject = gson.fromJson(report, JsonObject.class);
JsonArray jsonReportDatasets;
if (jsonObject.getAsJsonArray("report_datasets") != null) {
jsonReportDatasets = jsonObject.getAsJsonArray("report_datasets");
} else {
jsonReportDatasets = jsonObject.getAsJsonArray("report-datasets");
}
for (JsonElement datasetElement : jsonReportDatasets) {
// JsonElement dataset_title = datasetElement.getAsJsonObject().get("dataset-title");
String dataset_title = datasetElement.getAsJsonObject().get("dataset-title").getAsString();
String yop = datasetElement.getAsJsonObject().get("yop").getAsString();
String uri = datasetElement.getAsJsonObject().get("uri").getAsString();
String platform = datasetElement.getAsJsonObject().get("platform").getAsString();
String data_type = datasetElement.getAsJsonObject().get("data-type").getAsString();
String publisher = datasetElement.getAsJsonObject().get("publisher").getAsString();
JsonArray publisher_id = datasetElement.getAsJsonObject().getAsJsonArray("publisher-id");
String publisher_id_type = "";
String publisher_id_value = "";
for (JsonElement publisher_id_Element : publisher_id) {
publisher_id_type = publisher_id_Element.getAsJsonObject().get("type").getAsString();
publisher_id_value = publisher_id_Element.getAsJsonObject().get("value").getAsString();
}
JsonArray dataset_days = datasetElement.getAsJsonObject().getAsJsonArray("dataset-dates");
String ds_dates_type = "";
String ds_dates_value = "";
for (JsonElement datasetDaysElement : dataset_days) {
ds_dates_type = datasetDaysElement.getAsJsonObject().get("type").getAsString();
ds_dates_value = datasetDaysElement.getAsJsonObject().get("value").getAsString();
}
JsonArray datasetContributors = null;
String ds_contributor_type = "";
String[] ds_contributor_values = null;
Array ds_contributor_valuesArr = null;
if (datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors") != null) {
datasetContributors = datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors");
JsonArray datasetid = datasetElement.getAsJsonObject().getAsJsonArray("dataset-id");
String doi = "";
for (JsonElement datasetIDElement : datasetid)
//System.out.println(datasetIDElement.getAsJsonObject().get("value").getAsString());
{
doi = datasetIDElement.getAsJsonObject().get("value").getAsString();
}
String sqlInsertDataset = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema()
+ " .datasets(ds_type,"
+ "ds_title,yop,uri,platform,data_type,publisher,publisher_id_type,publisher_id_value,"
+ "ds_dates_type, ds_dates_value, ds_contributors,reportid) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?) ";
PreparedStatement pstmtDataset = ConnectDB.DB_HIVE_CONNECTION.prepareStatement(sqlInsertDataset);
pstmtDataset.setString(1, doi);
pstmtDataset.setString(2, dataset_title);
pstmtDataset.setString(3, yop);
pstmtDataset.setString(4, uri);
pstmtDataset.setString(5, platform);
pstmtDataset.setString(6, data_type);
pstmtDataset.setString(7, publisher);
pstmtDataset.setString(8, publisher_id_type);
pstmtDataset.setString(9, publisher_id_value);
pstmtDataset.setString(10, ds_dates_type);
pstmtDataset.setString(11, ds_dates_value);
pstmtDataset.setString(13, datasetContributors.getAsString());
pstmtDataset.setString(14, reportId);
pstmtDataset.execute();
logger.info("Dataset from compressed report addded " + doi);
/*
* JsonArray performance = datasetElement.getAsJsonObject().getAsJsonArray("performance"); for
* (JsonElement performanceElement : performance) { JsonObject period =
* performanceElement.getAsJsonObject().getAsJsonObject("period"); String end_date =
* period.getAsJsonObject().get("end-date").getAsString(); String begin_date =
* period.getAsJsonObject().get("begin-date").getAsString(); JsonArray instance =
* performanceElement.getAsJsonObject().getAsJsonArray("instance"); for (JsonElement instanceElement :
* instance) { int count = instanceElement.getAsJsonObject().get("count").getAsInt(); JsonObject
* country_counts = instanceElement.getAsJsonObject().getAsJsonObject("country-counts"); Set<String>
* keys = country_counts.keySet(); String[] country = new String[country_counts.size()]; String[]
* country_counts_val = new String[country_counts.size()]; Iterator it2 = keys.iterator(); int j = 0;
* while (it2.hasNext()) { country[j] = it2.next().toString(); country_counts_val[j] =
* country_counts.get(country[j]).getAsString(); } Array countryArr = conn.createArrayOf("text",
* country); Array countrycountsArr = conn.createArrayOf("text", country_counts_val); String metrictype
* = instanceElement.getAsJsonObject().get("metric-type").getAsString(); String accessMethod =
* instanceElement.getAsJsonObject().get("access-method").getAsString(); String
* sqlInsertDatasetPerformance =
* "INSERT INTO datasetperformance(ds_type,period_end,period_from,access_method,metric_type,count,country,country_count, reportid) VALUES(?,?,?,?,?,?,?,?,?)"
* ; PreparedStatement pstmtDatasetPerformance = conn.prepareStatement(sqlInsertDatasetPerformance);
* //System.out.println(begin_date + " " + end_date + " " + doi + " " + metrictype + " " + count);
* pstmtDatasetPerformance.setString(1, doi); pstmtDatasetPerformance.setString(2, end_date);
* pstmtDatasetPerformance.setString(3, begin_date); pstmtDatasetPerformance.setString(4, accessMethod);
* pstmtDatasetPerformance.setString(5, metrictype); pstmtDatasetPerformance.setInt(6, count);
* pstmtDatasetPerformance.setArray(7, countryArr); pstmtDatasetPerformance.setArray(8,
* countrycountsArr); pstmtDatasetPerformance.setString(9, reportId); pstmtDatasetPerformance.execute();
* } }
*/
}
}
}
private ArrayList<String> listHdfsDir(String dir) throws Exception {
FileSystem hdfs = FileSystem.get(new Configuration());
RemoteIterator<LocatedFileStatus> Files;
ArrayList<String> fileNames = new ArrayList<>();
try {
Path exportPath = new Path(hdfs.getUri() + dir);
Files = hdfs.listFiles(exportPath, false);
while (Files.hasNext()) {
String fileName = Files.next().getPath().toString();
fileNames.add(fileName);
}
hdfs.close();
} catch (Exception e) {
logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir));
throw new Exception("HDFS file path with exported data does not exist : " + dir, e);
}
return fileNames;
}
private String readHDFSFile(String filename) throws Exception {
String result;
try {
FileSystem fs = FileSystem.get(new Configuration());
// log.info("reading file : " + filename);
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));
StringBuilder sb = new StringBuilder();
String line = br.readLine();
while (line != null) {
sb.append(line);
// sb.append(line);
line = br.readLine();
}
// result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
result = sb.toString().trim();
// fs.close();
} catch (Exception e) {
throw new Exception(e);
}
return result;
}
public static String uncompressString(String zippedBase64Str)
throws IOException {
String result = null;
// In my solr project, I use org.apache.solr.common.util.Base64.
// byte[] bytes =
// org.apache.solr.common.util.Base64.base64ToByteArray(zippedBase64Str);
byte[] bytes = Base64.getDecoder().decode(zippedBase64Str);
GZIPInputStream zi = null;
try {
zi = new GZIPInputStream(new ByteArrayInputStream(bytes));
result = IOUtils.toString(zi);
} finally {
IOUtils.closeQuietly(zi);
}
return result;
}
private void createTmpReportsTable(String jsonFile) throws SQLException {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
dropTmpReportsTable();
String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema()
+ ".tmpjson (json STRING)";
stmt.executeUpdate(createTmpTable);
logger.info("Tmp Table Created");
String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE "
+ ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
stmt.execute(insertJsonReport);
logger.info("JSON Report File inserted to tmpjson Table");
}
private void dropTmpReportsTable() throws SQLException {
logger.info("Dropping tmpjson Table");
String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson";
Statement stmt = ConnectDB.getHiveConnection().createStatement();
stmt.executeUpdate(dropTmpTable);
logger.info("Dropped tmpjson Table");
}
}
/*
* PreparedStatement prepStatem = conn.
* prepareStatement("insert into usageStats (source, entityID,sourceItemType,entityType, counter,action,timestamp_month,referrer) values (?,?,?,?,?,?,?,?)"
* );
*/
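
readReports() above relies on a small Hive pattern: each downloaded JSON report is loaded verbatim into a one-column tmpjson table and its fields are extracted with get_json_object(). The condensed sketch below is not part of the commit and only shows that pattern in isolation; the HDFS path is a placeholder and the SQL mirrors the statements used above.

import java.sql.ResultSet;
import java.sql.Statement;

public class TmpJsonSketch {
	public static void main(String[] args) throws Exception {
		ConnectDB.init();
		Statement stmt = ConnectDB.getHiveConnection().createStatement();
		String schema = ConnectDB.getDataSetUsageStatsDBSchema();
		// One-column staging table holding the raw report JSON
		stmt.executeUpdate("DROP TABLE IF EXISTS " + schema + ".tmpjson");
		stmt.executeUpdate("CREATE TEMPORARY TABLE " + schema + ".tmpjson (json STRING)");
		// Placeholder HDFS path of a single downloaded report
		stmt.execute("LOAD DATA INPATH '/tmp/datacitereports/report.json' INTO TABLE " + schema + ".tmpjson");
		// Pull individual fields out of the JSON, as readReports() does for the report id
		ResultSet rs = stmt.executeQuery("SELECT get_json_object(json, '$.report.id') FROM " + schema + ".tmpjson");
		while (rs.next()) {
			System.out.println("report id: " + rs.getString(1));
		}
		stmt.close();
	}
}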

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.datasetsusagestats.export;
import java.io.IOException;
@ -17,220 +18,94 @@ import org.slf4j.LoggerFactory;
*/
public class UsageStatsExporter {
private Statement stmt = null;
public UsageStatsExporter() {
}
private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
private void reCreateLogDirs() throws IllegalArgumentException, IOException {
FileSystem dfs = FileSystem.get(new Configuration());
logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath);
dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true);
logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath);
dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath));
}
public void export() throws Exception {
logger.info("Initialising DB properties");
ConnectDB.init();
ConnectDB.getHiveConnection();
if (ExecuteWorkflow.recreateDbAndTables) {
DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", "");
datasetsDB.recreateDBAndTables();
}
logger.info("Initializing the download logs module");
DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL,
ExecuteWorkflow.dataciteReportPath);
if (ExecuteWorkflow.datasetsEmptyDirs) {
logger.info("Downloading Reports List From Datacite");
drfd.downloadReportsList();
logger.info("Reports List has been downloaded");
}
ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite(
ExecuteWorkflow.dataciteReportPath);
logger.info("Store Reports To DB");
readReportsListFromDatacite.readReports();
logger.info("Reports Stored To DB");
}
private void createDatabase() throws Exception {
try {
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Dropping datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE";
stmt.executeUpdate(dropDatabase);
} catch (Exception e) {
logger.error("Failed to drop database: " + e);
throw new Exception("Failed to drop database: " + e.toString(), e);
}
try {
stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema());
String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema();
stmt.executeUpdate(createDatabase);
} catch (Exception e) {
logger.error("Failed to create database: " + e);
throw new Exception("Failed to create database: " + e.toString(), e);
}
}
private void createTables() throws Exception {
try {
stmt = ConnectDB.getHiveConnection().createStatement();
// Create Reports table - This table should exist
String sqlCreateTableDataciteeReports = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getDataSetUsageStatsDBSchema()
+ ".datacitereports(reportid STRING, \n"
+ " name STRING, \n"
+ " source STRING,\n"
+ " release STRING,\n"
+ " createdby STRING,\n"
+ " report_end_date STRING,\n"
+ " report_start_date STRING)\n"
+ " CLUSTERED BY (reportid)\n"
+ " into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableDataciteeReports);
stmt.close();
ConnectDB.getHiveConnection().close();
} catch (Exception e) {
logger.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
}
// runImpalaQuery();
/*
* PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
* logger.info("Re-creating database and tables"); logger.info("Initializing the download logs module");
* PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
* if (ExecuteWorkflow.piwikEmptyDirs) { logger.info("Recreating Piwik log directories");
* piwikstatsdb.reCreateLogDirs(); } // Downloading piwik logs (also managing directory creation) if
* (ExecuteWorkflow.downloadPiwikLogs) { logger.info("Downloading piwik logs"); piwd .GetOpenAIRELogs(
* ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); }
* logger.info("Downloaded piwik logs"); // Create DB tables, insert/update statistics String cRobotsUrl =
* "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
* piwikstatsdb.setCounterRobotsURL(cRobotsUrl); if (ExecuteWorkflow.processPiwikLogs) {
* logger.info("Processing logs"); piwikstatsdb.processLogs(); } logger.info("Creating LaReferencia tables");
* LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
* ExecuteWorkflow.lareferenciaAuthToken); if (ExecuteWorkflow.laReferenciaEmptyDirs) {
* logger.info("Recreating LaReferencia log directories"); lrf.reCreateLogDirs(); } if
* (ExecuteWorkflow.downloadLaReferenciaLogs) { logger.info("Downloading LaReferencia logs");
* lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); }
* LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if
* (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); lastats.processLogs();
* logger.info("LaReferencia logs done"); } IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if
* (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables");
* irusstats.createTables(); logger.info("Created Irus Stats tables"); logger.info("Re-create log dirs");
* irusstats.reCreateLogDirs(); logger.info("Re-created log dirs"); } if (ExecuteWorkflow.irusDownloadReports) {
* irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } if (ExecuteWorkflow.irusProcessStats) {
* irusstats.processIrusStats(); logger.info("Irus done"); } SarcStats sarcStats = new SarcStats(); if
* (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); } if
* (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray,
* ExecuteWorkflow.sarcsReportPathNonArray); } if (ExecuteWorkflow.sarcProcessStats) {
* sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
* sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); // finalize usagestats if
* (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); logger.info("Finalized stats"); } // Make the
* tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) {
* logger.info("Making tables visible to Impala"); invalidateMetadata(); } logger.info("End");
*/
}
/*
* private void invalidateMetadata() throws SQLException { Statement stmt = null; stmt =
* ConnectDB.getImpalaConnection().createStatement(); String sql = "INVALIDATE METADATA " +
* ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA "
* + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " +
* ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " +
* ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); stmt.close();
* ConnectDB.getHiveConnection().close(); }
*/

View File

@ -2,13 +2,13 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- <parent>
<artifactId>dhp-workflows</artifactId >
<groupId>eu.dnetlib.dhp</groupId>
<version>1.1.7-SNAPSHOT</version>
</parent>
<groupId>eu.dnetlib</groupId> -->
<!-- <parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.1.7-SNAPSHOT</version>
@ -23,14 +23,42 @@
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-usage-raw-data-update</artifactId>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.15</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<!-- more config here as you see fit -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version>
<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
@ -53,16 +81,16 @@
<version>20180130</version>
<type>jar</type>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${cdh.hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${cdh.hadoop.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>

View File

@ -0,0 +1 @@
mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagerawdata

View File

@ -122,4 +122,4 @@ public abstract class ConnectDB {
}
}

View File

@ -62,7 +62,6 @@ public class ExecuteWorkflow {
static int sarcNumberOfIssnToDownload;
static boolean finalizeStats;
static boolean finalTablesVisibleToImpala;
static int numberOfDownloadThreads;
@ -98,98 +97,108 @@ public class ExecuteWorkflow {
usageStatsDBSchema = parser.get("usageStatsDBSchema");
statsDBSchema = parser.get("statsDBSchema");
if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
recreateDbAndTables = true;
} else {
recreateDbAndTables = false;
}
if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) {
piwikEmptyDirs = true;
} else {
piwikEmptyDirs = false;
}
if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) {
downloadPiwikLogs = true;
} else {
downloadPiwikLogs = false;
}
if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
processPiwikLogs = true;
} else {
processPiwikLogs = false;
}
String startingLogPeriodStr = parser.get("startingLogPeriod");
Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
// String endingLogPeriodStr = parser.get("endingLogPeriod");
// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) {
laReferenciaEmptyDirs = true;
} else {
laReferenciaEmptyDirs = false;
}
if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) {
downloadLaReferenciaLogs = true;
} else {
downloadLaReferenciaLogs = false;
}
if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) {
processLaReferenciaLogs = true;
} else {
processLaReferenciaLogs = false;
}
if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) {
irusCreateTablesEmptyDirs = true;
} else {
irusCreateTablesEmptyDirs = false;
}
if (parser.get("irusDownloadReports").toLowerCase().equals("true")) {
irusDownloadReports = true;
} else {
irusDownloadReports = false;
}
if (parser.get("irusProcessStats").toLowerCase().equals("true")) {
irusProcessStats = true;
} else {
irusProcessStats = false;
}
irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload"));
if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) {
sarcCreateTablesEmptyDirs = true;
} else {
sarcCreateTablesEmptyDirs = false;
}
if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) {
sarcDownloadReports = true;
} else {
sarcDownloadReports = false;
}
if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
sarcProcessStats = true;
} else {
sarcProcessStats = false;
}
sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload"));
/*
if (parser.get("finalizeStats").toLowerCase().equals("true"))
finalizeStats = true;
else
finalizeStats = false;
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true"))
finalTablesVisibleToImpala = true;
else
finalTablesVisibleToImpala = false;
*/
if (parser.get("finalizeStats").toLowerCase().equals("true")) {
finalizeStats = true;
} else {
finalizeStats = false;
}
numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
usagestatsExport.export();
// usagestatsExport.createdDBWithTablesOnly();
}
private static Calendar startingLogPeriodStr(Date date) { private static Calendar startingLogPeriodStr(Date date) {
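Note on the flag parsing above: every workflow switch is read with the same parser.get(...).toLowerCase().equals("true") pattern. A minimal sketch of a helper that collapses those if/else blocks, assuming parser.get(name) returns the raw string value as shown in the diff (the class and method names below are hypothetical, not part of this commit):
// Hypothetical helper, not part of this commit.
// Boolean.parseBoolean is case-insensitive and returns false for null or any value other than "true",
// whereas the blocks above assume parser.get(name) is never null.
public final class BooleanFlag {
	private BooleanFlag() {
	}
	public static boolean parse(String rawValue) {
		return Boolean.parseBoolean(rawValue);
	}
	public static void main(String[] args) {
		System.out.println(parse("TRUE")); // true
		System.out.println(parse("false")); // false
		System.out.println(parse(null)); // false
	}
}
With such a helper, a line like recreateDbAndTables = BooleanFlag.parse(parser.get("recreateDbAndTables")); would replace each four-line if/else block.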

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagerawdata.export; package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*; import java.io.*;
@ -27,393 +28,331 @@ import org.slf4j.LoggerFactory;
*/ */
public class IrusStats { public class IrusStats {
private String irusUKURL; private String irusUKURL;
private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
public IrusStats(String irusUKURL) throws Exception { public IrusStats(String irusUKURL) throws Exception {
this.irusUKURL = irusUKURL; this.irusUKURL = irusUKURL;
// The following may not be needed - It will be created when JSON tables are created // The following may not be needed - It will be created when JSON tables are created
// createTmpTables(); // createTmpTables();
} }
public void reCreateLogDirs() throws Exception { public void reCreateLogDirs() throws Exception {
FileSystem dfs = FileSystem.get(new Configuration()); FileSystem dfs = FileSystem.get(new Configuration());
logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true);
logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath));
} }
public void createTables() throws Exception { public void createTables() throws Exception {
try { try {
logger.info("Creating sushilog"); logger.info("Creating sushilog");
Statement stmt = ConnectDB.getHiveConnection().createStatement(); Statement stmt = ConnectDB.getHiveConnection().createStatement();
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog(source STRING, " + ".sushilog(source STRING, "
+ "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
+ "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableSushiLog); stmt.executeUpdate(sqlCreateTableSushiLog);
logger.info("Created sushilog"); logger.info("Created sushilog");
// To see how to apply to the ignore duplicate rules and indexes stmt.close();
// stmt.executeUpdate(sqlCreateTableSushiLog); ConnectDB.getHiveConnection().close();
// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " logger.info("Sushi Tables Created");
// + " ON INSERT TO sushilog " } catch (Exception e) {
// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," logger.error("Failed to create tables: " + e);
// + "sushilog.rid, sushilog.date " throw new Exception("Failed to create tables: " + e.toString(), e);
// + "FROM sushilog " }
// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; }
// stmt.executeUpdate(sqlcreateRuleSushiLog);
// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
// stmt.executeUpdate(createSushiIndex);
stmt.close();
ConnectDB.getHiveConnection().close();
logger.info("Sushi Tables Created");
} catch (Exception e) {
logger.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
}
// // The following may not be needed - It will be created when JSON tables are created public void processIrusStats() throws Exception {
// private void createTmpTables() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement();
// try { ConnectDB.getHiveConnection().setAutoCommit(false);
//
// Statement stmt = ConnectDB.getConnection().createStatement();
// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
// stmt.executeUpdate(sqlCreateTableSushiLog);
//
// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;");
// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;";
// // stmt.executeUpdate(sqlCopyPublicSushiLog);
// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO sushilogtmp "
// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository,"
// + "sushilogtmp.rid, sushilogtmp.date "
// + "FROM sushilogtmp "
// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
// stmt.executeUpdate(sqlcreateRuleSushiLog);
//
// stmt.close();
// ConnectDB.getConnection().close();
// log.info("Sushi Tmp Tables Created");
// } catch (Exception e) {
// log.error("Failed to create tables: " + e);
// throw new Exception("Failed to create tables: " + e.toString(), e);
// }
// }
public void processIrusStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Adding JSON Serde jar"); logger.info("Adding JSON Serde jar");
stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
logger.info("Added JSON Serde jar"); logger.info("Added JSON Serde jar");
logger.info("Dropping sushilogtmp_json table"); logger.info("Dropping sushilogtmp_json table");
String dropSushilogtmpJson = "DROP TABLE IF EXISTS " String dropSushilogtmpJson = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ConnectDB.getUsageStatsDBSchema()
+ ".sushilogtmp_json"; + ".sushilogtmp_json";
stmt.executeUpdate(dropSushilogtmpJson); stmt.executeUpdate(dropSushilogtmpJson);
logger.info("Dropped sushilogtmp_json table"); logger.info("Dropped sushilogtmp_json table");
logger.info("Creating irus_sushilogtmp_json table"); logger.info("Creating irus_sushilogtmp_json table");
String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n"
+ " `ItemIdentifier` ARRAY<\n" + " `ItemIdentifier` ARRAY<\n"
+ " struct<\n" + " struct<\n"
+ " Type: STRING,\n" + " Type: STRING,\n"
+ " Value: STRING\n" + " Value: STRING\n"
+ " >\n" + " >\n"
+ " >,\n" + " >,\n"
+ " `ItemPerformance` ARRAY<\n" + " `ItemPerformance` ARRAY<\n"
+ " struct<\n" + " struct<\n"
+ " `Period`: struct<\n" + " `Period`: struct<\n"
+ " `Begin`: STRING,\n" + " `Begin`: STRING,\n"
+ " `End`: STRING\n" + " `End`: STRING\n"
+ " >,\n" + " >,\n"
+ " `Instance`: struct<\n" + " `Instance`: struct<\n"
+ " `Count`: STRING,\n" + " `Count`: STRING,\n"
+ " `MetricType`: STRING\n" + " `MetricType`: STRING\n"
+ " >\n" + " >\n"
+ " >\n" + " >\n"
+ " >\n" + " >\n"
+ ")\n" + ")\n"
+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n"
+ "TBLPROPERTIES (\"transactional\"=\"false\")"; + "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(createSushilogtmpJson); stmt.executeUpdate(createSushilogtmpJson);
logger.info("Created irus_sushilogtmp_json table"); logger.info("Created irus_sushilogtmp_json table");
logger.info("Dropping irus_sushilogtmp table"); logger.info("Dropping irus_sushilogtmp table");
String dropSushilogtmp = "DROP TABLE IF EXISTS " String dropSushilogtmp = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ConnectDB.getUsageStatsDBSchema()
+ ".irus_sushilogtmp"; + ".irus_sushilogtmp";
stmt.executeUpdate(dropSushilogtmp); stmt.executeUpdate(dropSushilogtmp);
logger.info("Dropped irus_sushilogtmp table"); logger.info("Dropped irus_sushilogtmp table");
logger.info("Creating irus_sushilogtmp table"); logger.info("Creating irus_sushilogtmp table");
String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
+ ".irus_sushilogtmp(source STRING, repository STRING, " + ".irus_sushilogtmp(source STRING, repository STRING, "
+ "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
+ "tblproperties('transactional'='true')"; + "tblproperties('transactional'='true')";
stmt.executeUpdate(createSushilogtmp); stmt.executeUpdate(createSushilogtmp);
logger.info("Created irus_sushilogtmp table"); logger.info("Created irus_sushilogtmp table");
logger.info("Inserting to irus_sushilogtmp table"); logger.info("Inserting to irus_sushilogtmp table");
String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp "
+ "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), "
+ "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, "
+ "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json "
+ "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
+ "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf "
+ "WHERE `ItemIdent`.`Type`= 'OAI'"; + "WHERE `ItemIdent`.`Type`= 'OAI'";
stmt.executeUpdate(insertSushilogtmp); stmt.executeUpdate(insertSushilogtmp);
logger.info("Inserted to irus_sushilogtmp table"); logger.info("Inserted to irus_sushilogtmp table");
/*
logger.info("Creating downloads_stats table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`result_id` string, "
+ "`date` string, "
+ "`count` bigint, "
+ "`openaire` bigint)";
stmt.executeUpdate(createDownloadsStats);
logger.info("Created downloads_stats table");
logger.info("Inserting into downloads_stats"); logger.info("Inserting to sushilog table");
String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM "
+ "SELECT s.source, d.id AS repository_id, " + ConnectDB.getUsageStatsDBSchema()
+ "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " + ".irus_sushilogtmp";
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " stmt.executeUpdate(insertToShushilog);
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, " logger.info("Inserted to sushilog table");
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
stmt.executeUpdate(insertDStats);
logger.info("Inserted into downloads_stats");
logger.info("Creating sushilog table"); ConnectDB.getHiveConnection().close();
String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() }
+ ".sushilog "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`rid` string, "
+ "`date` string, "
+ "`metric_type` string, "
+ "`count` int)";
stmt.executeUpdate(createSushilog);
logger.info("Created sushilog table");
*/
logger.info("Inserting to sushilog table");
String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM "
+ ConnectDB.getUsageStatsDBSchema()
+ ".irus_sushilogtmp";
stmt.executeUpdate(insertToShushilog);
logger.info("Inserted to sushilog table");
ConnectDB.getHiveConnection().close(); public void getIrusRRReport(String irusUKReportPath) throws Exception {
} SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM");
// Setting the starting period
Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime()));
public void getIrusRRReport(String irusUKReportPath) throws Exception { // Setting the ending period (last day of the month)
SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
// Setting the starting period // end.add(Calendar.MONTH, +1);
Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); // end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
// Setting the ending period (last day of the month) logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime()));
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1);
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime()));
String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime())
+ "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback=";
logger.info("(getIrusRRReport) Getting report: " + reportUrl); logger.info("(getIrusRRReport) Getting report: " + reportUrl);
String text = getJson(reportUrl, "", ""); String text = getJson(reportUrl, "", "");
List<String> opendoarsToVisit = new ArrayList<String>(); List<String> opendoarsToVisit = new ArrayList<String>();
JSONParser parser = new JSONParser(); JSONParser parser = new JSONParser();
JSONObject jsonObject = (JSONObject) parser.parse(text); JSONObject jsonObject = (JSONObject) parser.parse(text);
jsonObject = (JSONObject) jsonObject.get("ReportResponse"); jsonObject = (JSONObject) jsonObject.get("ReportResponse");
jsonObject = (JSONObject) jsonObject.get("Report"); jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Report"); jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Customer"); jsonObject = (JSONObject) jsonObject.get("Customer");
JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
int i = 0; if (jsonArray != null) {
for (Object aJsonArray : jsonArray) { int i = 0;
JSONObject jsonObjectRow = (JSONObject) aJsonArray; for (Object aJsonArray : jsonArray) {
JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); JSONObject jsonObjectRow = (JSONObject) aJsonArray;
for (Object identifier : itemIdentifier) { JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
JSONObject opendoar = (JSONObject) identifier; for (Object identifier : itemIdentifier) {
if (opendoar.get("Type").toString().equals("OpenDOAR")) { JSONObject opendoar = (JSONObject) identifier;
i++; if (opendoar.get("Type").toString().equals("OpenDOAR")) {
opendoarsToVisit.add(opendoar.get("Value").toString()); i++;
break; opendoarsToVisit.add(opendoar.get("Value").toString());
} break;
} }
// break; }
} // break;
}
logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit);
if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0
&& ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) {
logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
} }
logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit);
for (String opendoar : opendoarsToVisit) { for (String opendoar : opendoarsToVisit) {
logger.info("Now working on openDoar: " + opendoar); logger.info("Now working on openDoar: " + opendoar);
this.getIrusIRReport(opendoar, irusUKReportPath); this.getIrusIRReport(opendoar, irusUKReportPath);
} }
logger.info("(getIrusRRReport) Finished with report: " + reportUrl);
} else {
logger.info("IRUS Reports not found for day");
}
logger.info("(getIrusRRReport) Finished with report: " + reportUrl); }
}
private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception {
logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar);
ConnectDB.getHiveConnection().setAutoCommit(false); ConnectDB.getHiveConnection().setAutoCommit(false);
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM"); SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM");
// Setting the starting period // Setting the starting period
Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
// Setting the ending period (last day of the month) // Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); Calendar end = Calendar.getInstance();
end.add(Calendar.MONTH, +1); end.add(Calendar.DAY_OF_MONTH, -1);
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
PreparedStatement st = ConnectDB // end.add(Calendar.MONTH, +1);
.getHiveConnection() // end.add(Calendar.DAY_OF_MONTH, -1);
.prepareStatement( logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
"SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
st.setString(1, "opendoar____::" + opendoar);
ResultSet rs_date = st.executeQuery();
Date dateMax = null;
while (rs_date.next()) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
dateMax = sdf.parse(rs_date.getString(1));
}
}
rs_date.close();
int batch_size = 0;
if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); PreparedStatement st = ConnectDB
} else { .getHiveConnection()
while (start.before(end)) { .prepareStatement(
logger.info("date: " + simpleDateFormat.format(start.getTime())); "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" st.setString(1, "opendoar____::" + opendoar);
+ simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) ResultSet rs_date = st.executeQuery();
+ "&RepositoryIdentifier=opendoar%3A" + opendoar Date dateMax = null;
+ "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; while (rs_date.next()) {
start.add(Calendar.MONTH, 1); if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
dateMax = sdf.parse(rs_date.getString(1));
}
}
rs_date.close();
int batch_size = 0;
logger.info("Downloading file: " + reportUrl); if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) {
String text = getJson(reportUrl, "", ""); logger.info("Date found in logs " + dateMax + " and not downloading logs for " + opendoar);
if (text == null) { } else {
continue; start.add(Calendar.MONTH, 1);
} while (start.before(end)) {
logger.info("Downloading for date: " + simpleDateFormat.format(start.getTime()));
String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime())
+ "&RepositoryIdentifier=opendoar%3A" + opendoar
+ "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
start.add(Calendar.MONTH, 1);
FileSystem fs = FileSystem.get(new Configuration()); logger.info("Downloading file: " + reportUrl);
String filePath = irusUKReportPath + "/" + "IrusIRReport_" String text = getJson(reportUrl, "", "");
+ opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; if (text == null) {
logger.info("Storing to file: " + filePath); continue;
FSDataOutputStream fin = fs.create(new Path(filePath), true); }
JSONParser parser = new JSONParser(); FileSystem fs = FileSystem.get(new Configuration());
JSONObject jsonObject = (JSONObject) parser.parse(text); String filePath = irusUKReportPath + "/" + "IrusIRReport_"
jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json";
jsonObject = (JSONObject) jsonObject.get("Report"); logger.info("Storing to file: " + filePath);
jsonObject = (JSONObject) jsonObject.get("Report"); FSDataOutputStream fin = fs.create(new Path(filePath), true);
jsonObject = (JSONObject) jsonObject.get("Customer");
JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
if (jsonArray == null) {
continue;
}
String oai = "";
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
fin.write(jsonObjectRow.toJSONString().getBytes());
fin.writeChar('\n');
}
fin.close(); JSONParser parser = new JSONParser();
} JSONObject jsonObject = (JSONObject) parser.parse(text);
jsonObject = (JSONObject) jsonObject.get("ReportResponse");
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Customer");
JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
if (jsonArray == null) {
continue;
}
String oai = "";
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
fin.write(jsonObjectRow.toJSONString().getBytes());
fin.writeChar('\n');
}
} fin.close();
//ConnectDB.getHiveConnection().close(); }
logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); }
} // ConnectDB.getHiveConnection().close();
private String getJson(String url) throws Exception { logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar);
try { }
System.out.println("===> Connecting to: " + url);
URL website = new URL(url);
System.out.println("Connection url -----> " + url);
URLConnection connection = website.openConnection();
// connection.setRequestProperty ("Authorization", "Basic "+encoded); private String getJson(String url) throws Exception {
StringBuilder response; try {
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { System.out.println("===> Connecting to: " + url);
response = new StringBuilder(); URL website = new URL(url);
String inputLine; System.out.println("Connection url -----> " + url);
while ((inputLine = in.readLine()) != null) { URLConnection connection = website.openConnection();
response.append(inputLine);
// connection.setRequestProperty ("Authorization", "Basic "+encoded);
StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder();
String inputLine;
while ((inputLine = in.readLine()) != null) {
response.append(inputLine);
// response.append("\n"); // response.append("\n");
} }
} }
System.out.println("response ====> " + response.toString()); System.out.println("response ====> " + response.toString());
return response.toString(); return response.toString();
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to get URL: " + e); logger.error("Failed to get URL: " + e);
System.out.println("Failed to get URL: " + e); System.out.println("Failed to get URL: " + e);
throw new Exception("Failed to get URL: " + e.toString(), e); throw new Exception("Failed to get URL: " + e.toString(), e);
} }
} }
private String getJson(String url, String username, String password) throws Exception { private String getJson(String url, String username, String password) throws Exception {
// String cred=username+":"+password; // String cred=username+":"+password;
// String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
try { try {
URL website = new URL(url); URL website = new URL(url);
URLConnection connection = website.openConnection(); URLConnection connection = website.openConnection();
// connection.setRequestProperty ("Authorization", "Basic "+encoded); // connection.setRequestProperty ("Authorization", "Basic "+encoded);
StringBuilder response; StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder(); response = new StringBuilder();
String inputLine; String inputLine;
while ((inputLine = in.readLine()) != null) { while ((inputLine = in.readLine()) != null) {
response.append(inputLine); response.append(inputLine);
response.append("\n"); response.append("\n");
} }
} }
return response.toString(); return response.toString();
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to get URL", e); logger.error("Failed to get URL", e);
return null; return null;
} }
} }
} }
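For reference, the monthly IR1 report URL assembled in getIrusIRReport follows the pattern below. A self-contained sketch; the base URL and OpenDOAR id are placeholder assumptions, not values from this commit:
import java.text.SimpleDateFormat;
import java.util.Calendar;
// Sketch only: reproduces the IR1 query-string layout used by getIrusIRReport above.
public class IrusReportUrlExample {
	public static void main(String[] args) {
		SimpleDateFormat month = new SimpleDateFormat("yyyy-MM");
		Calendar period = Calendar.getInstance();
		period.set(2020, Calendar.NOVEMBER, 1);
		String irusUKURL = "https://irus.example.org/sushilite/"; // assumption: supplied by the workflow configuration
		String opendoar = "1234"; // hypothetical repository id
		String reportUrl = irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
			+ month.format(period.getTime()) + "&EndDate=" + month.format(period.getTime())
			+ "&RepositoryIdentifier=opendoar%3A" + opendoar
			+ "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
		System.out.println(reportUrl);
	}
}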

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagerawdata.export; package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*; import java.io.*;
@ -27,49 +28,49 @@ import org.slf4j.LoggerFactory;
*/ */
public class LaReferenciaDownloadLogs { public class LaReferenciaDownloadLogs {
private final String piwikUrl; private final String piwikUrl;
private Date startDate; private Date startDate;
private final String tokenAuth; private final String tokenAuth;
/* /*
* The Piwik's API method * The Piwik's API method
*/ */
private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
private final String format = "&format=json"; private final String format = "&format=json";
private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess";
private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class);
public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception {
this.piwikUrl = piwikUrl; this.piwikUrl = piwikUrl;
this.tokenAuth = tokenAuth; this.tokenAuth = tokenAuth;
this.createTables(); this.createTables();
// this.createTmpTables(); // this.createTmpTables();
} }
public void reCreateLogDirs() throws IllegalArgumentException, IOException { public void reCreateLogDirs() throws IllegalArgumentException, IOException {
FileSystem dfs = FileSystem.get(new Configuration()); FileSystem dfs = FileSystem.get(new Configuration());
logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
} }
private void createTables() throws Exception { private void createTables() throws Exception {
try { try {
Statement stmt = ConnectDB.getHiveConnection().createStatement(); Statement stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating LaReferencia tables"); logger.info("Creating LaReferencia tables");
String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
+ "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ "stored as orc tblproperties('transactional'='true')"; + "stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableLareferenciaLog); stmt.executeUpdate(sqlCreateTableLareferenciaLog);
logger.info("Created LaReferencia tables"); logger.info("Created LaReferencia tables");
// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " // String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO lareferencialog " // + " ON INSERT TO lareferencialog "
// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," // + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
@ -80,16 +81,16 @@ public class LaReferenciaDownloadLogs {
// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); // stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); // stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
stmt.close(); stmt.close();
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
logger.info("Lareferencia Tables Created"); logger.info("Lareferencia Tables Created");
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to create tables: " + e); logger.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e); throw new Exception("Failed to create tables: " + e.toString(), e);
// System.exit(0); // System.exit(0);
} }
} }
// private void createTmpTables() throws Exception { // private void createTmpTables() throws Exception {
// //
@ -114,152 +115,159 @@ public class LaReferenciaDownloadLogs {
// // System.exit(0); // // System.exit(0);
// } // }
// } // }
private String getPiwikLogUrl() { private String getPiwikLogUrl() {
return piwikUrl + "/"; return piwikUrl + "/";
} }
private String getJson(String url) throws Exception { private String getJson(String url) throws Exception {
try { try {
URL website = new URL(url); URL website = new URL(url);
URLConnection connection = website.openConnection(); URLConnection connection = website.openConnection();
StringBuilder response; StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder(); response = new StringBuilder();
String inputLine; String inputLine;
while ((inputLine = in.readLine()) != null) { while ((inputLine = in.readLine()) != null) {
response.append(inputLine); response.append(inputLine);
// response.append("\n"); // response.append("\n");
} }
} }
return response.toString(); return response.toString();
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to get URL: " + e); logger.error("Failed to get URL: " + e);
throw new Exception("Failed to get URL: " + e.toString(), e); throw new Exception("Failed to get URL: " + e.toString(), e);
} }
} }
public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { public void GetLaReferenciaRepos(String repoLogsPath) throws Exception {
String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
String content = ""; String content = "";
List<Integer> siteIdsToVisit = new ArrayList<Integer>(); List<Integer> siteIdsToVisit = new ArrayList<Integer>();
// Getting all the siteIds in a list for logging reasons & limiting the list // Getting all the siteIds in a list for logging reasons & limiting the list
// to the max number of siteIds // to the max number of siteIds
content = getJson(baseApiUrl); content = getJson(baseApiUrl);
JSONParser parser = new JSONParser(); JSONParser parser = new JSONParser();
JSONArray jsonArray = (JSONArray) parser.parse(content); JSONArray jsonArray = (JSONArray) parser.parse(content);
for (Object aJsonArray : jsonArray) { for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray; JSONObject jsonObjectRow = (JSONObject) aJsonArray;
siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
} }
logger.info("Found the following siteIds for download: " + siteIdsToVisit); logger.info("Found the following siteIds for download: " + siteIdsToVisit);
if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
&& ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) {
logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
} }
logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit);
for (int siteId : siteIdsToVisit) { for (int siteId : siteIdsToVisit) {
logger.info("Now working on LaReferencia MatomoId: " + siteId); logger.info("Now working on LaReferencia MatomoId: " + siteId);
this.GetLaReFerenciaLogs(repoLogsPath, siteId); this.GetLaReFerenciaLogs(repoLogsPath, siteId);
} }
} }
public void GetLaReFerenciaLogs(String repoLogsPath, public void GetLaReFerenciaLogs(String repoLogsPath,
int laReferencialMatomoID) throws Exception { int laReferencialMatomoID) throws Exception {
logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
// Setting the starting period // Setting the starting period
Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
logger.info("Starting period for log download: " + sdf.format(start.getTime())); logger.info("Starting period for log download: " + sdf.format(start.getTime()));
// Setting the ending period (last day of the month) // Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1); // end.add(Calendar.MONTH, +1);
end.add(Calendar.DAY_OF_MONTH, -1); // end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("Ending period for log download: " + sdf.format(end.getTime())); Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
PreparedStatement st = ConnectDB logger.info("Ending period for log download: " + sdf.format(end.getTime()));
.getHiveConnection()
.prepareStatement(
"SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".lareferencialog WHERE matomoid=?");
st.setInt(1, laReferencialMatomoID);
Date dateMax = null;
ResultSet rs_date = st.executeQuery(); PreparedStatement st = ConnectDB
while (rs_date.next()) { .getHiveConnection()
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") .prepareStatement(
&& !rs_date.getString(1).equals("")) { "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
start.setTime(sdf.parse(rs_date.getString(1))); + ".lareferencialog WHERE matomoid=?");
dateMax = sdf.parse(rs_date.getString(1)); st.setInt(1, laReferencialMatomoID);
} Date dateMax = null;
}
rs_date.close();
for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { ResultSet rs_date = st.executeQuery();
Date date = currDay.getTime(); while (rs_date.next()) {
if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID); && !rs_date.getString(1).equals("")) {
} else { start.setTime(sdf.parse(rs_date.getString(1)));
logger dateMax = sdf.parse(rs_date.getString(1));
.info( }
"Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " }
+ sdf.format(date)); rs_date.close();
String period = "&period=day&date=" + sdf.format(date); for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
String outFolder = ""; Date date = currDay.getTime();
outFolder = repoLogsPath; if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
logger
.info(
"Date found in logs " + dateMax + " and not downloanding Matomo logs for "
+ laReferencialMatomoID);
} else {
logger
.info(
"Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "
+ sdf.format(date));
FileSystem fs = FileSystem.get(new Configuration()); String period = "&period=day&date=" + sdf.format(date);
FSDataOutputStream fin = fs String outFolder = "";
.create( outFolder = repoLogsPath;
new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"),
true);
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format FileSystem fs = FileSystem.get(new Configuration());
+ "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; FSDataOutputStream fin = fs
String content = ""; .create(
int i = 0; new Path(
outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"),
true);
JSONParser parser = new JSONParser(); String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format
do { + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
String apiUrl = baseApiUrl; String content = "";
int i = 0;
if (i > 0) { JSONParser parser = new JSONParser();
apiUrl += "&filter_offset=" + (i * 1000); do {
} String apiUrl = baseApiUrl;
content = getJson(apiUrl); if (i > 0) {
if (content.length() == 0 || content.equals("[]")) { apiUrl += "&filter_offset=" + (i * 1000);
break; }
}
JSONArray jsonArray = (JSONArray) parser.parse(content); content = getJson(apiUrl);
for (Object aJsonArray : jsonArray) { if (content.length() == 0 || content.equals("[]")) {
JSONObject jsonObjectRaw = (JSONObject) aJsonArray; break;
fin.write(jsonObjectRaw.toJSONString().getBytes()); }
fin.writeChar('\n');
}
logger JSONArray jsonArray = (JSONArray) parser.parse(content);
.info( for (Object aJsonArray : jsonArray) {
"Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ " and for " fin.write(jsonObjectRaw.toJSONString().getBytes());
+ sdf.format(date)); fin.writeChar('\n');
i++; }
} while (true);
fin.close(); logger
} .info(
} "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID
} + " and for "
+ sdf.format(date));
i++;
} while (true);
fin.close();
}
}
}
} }
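The download loop above pages through Matomo's Live.getLastVisitsDetails in blocks of 1000 visits, bumping filter_offset until the API returns an empty array. A minimal sketch of just that paging pattern; fetchPage is a hypothetical stand-in for getJson and does not call a real Matomo instance:
import java.util.ArrayList;
import java.util.List;
// Sketch only: the offset-based paging used in GetLaReFerenciaLogs above.
public class MatomoPagingExample {
	// Placeholder for getJson(baseApiUrl + "&filter_offset=" + offset): returns two fake pages, then "[]".
	static String fetchPage(int offset) {
		return offset < 2000 ? "[{\"idVisit\":" + offset + "}]" : "[]";
	}
	public static void main(String[] args) {
		List<String> pages = new ArrayList<>();
		int i = 0;
		do {
			String content = fetchPage(i * 1000);
			if (content.length() == 0 || content.equals("[]")) {
				break; // no more visits for this day
			}
			pages.add(content); // the workflow writes each JSON row to HDFS at this point
			i++;
		} while (true);
		System.out.println("Downloaded " + pages.size() + " page(s)");
	}
}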

View File

@ -61,15 +61,6 @@ public class LaReferenciaStats {
"stored as orc tblproperties('transactional'='true')"; "stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableLareferenciaLog); stmt.executeUpdate(sqlCreateTableLareferenciaLog);
logger.info("Created LaReferencia tables"); logger.info("Created LaReferencia tables");
// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO lareferencialog "
// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id "
// + "FROM lareferencialog "
// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");";
// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
stmt.close(); stmt.close();
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
@ -82,30 +73,6 @@ public class LaReferenciaStats {
} }
} }
// private void createTmpTables() throws Exception {
//
// try {
// Statement stmt = ConnectDB.getConnection().createStatement();
// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO lareferencialogtmp "
// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit,"
// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id "
// + "FROM lareferencialogtmp "
// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog);
// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog);
//
// stmt.close();
// log.info("Lareferencia Tmp Tables Created");
//
// } catch (Exception e) {
// log.error("Failed to create tmptables: " + e);
// throw new Exception("Failed to create tmp tables: " + e.toString(), e);
// // System.exit(0);
// }
// }
public void processLogs() throws Exception { public void processLogs() throws Exception {
try { try {
logger.info("Processing LaReferencia repository logs"); logger.info("Processing LaReferencia repository logs");
@ -116,16 +83,7 @@ public class LaReferenciaStats {
removeDoubleClicks(); removeDoubleClicks();
logger.info("LaReferencia removed double clicks"); logger.info("LaReferencia removed double clicks");
/******** logger.info("LaReferencia updating Production Tables");
logger.info("LaReferencia creating viewsStats");
viewsStats();
logger.info("LaReferencia created viewsStats");
logger.info("LaReferencia creating downloadsStats");
downloadsStats();
logger.info("LaReferencia created downloadsStats");
************/
logger.info("LaReferencia updating Production Tables");
updateProdTables(); updateProdTables();
logger.info("LaReferencia updated Production Tables"); logger.info("LaReferencia updated Production Tables");
@ -255,88 +213,6 @@ public class LaReferenciaStats {
// conn.close(); // conn.close();
} }
public void viewsStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Creating la_result_views_monthly_tmp view");
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp AS "
+
"SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " +
"THEN 1 ELSE 0 END) AS openaire_referrer, " +
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp where action='action' and " +
"(source_item_type='oaItem' or source_item_type='repItem') " +
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
"source ORDER BY source, entity_id";
stmt.executeUpdate(sql);
logger.info("Created la_result_views_monthly_tmp view");
logger.info("Dropping la_views_stats_tmp table");
sql = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".la_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Dropped la_views_stats_tmp table");
logger.info("Creating la_views_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp " +
"AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " +
"max(views) AS count, max(openaire_referrer) AS openaire " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp p, " +
ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
"WHERE p.source=d.oid AND p.id=ro.oid " +
"GROUP BY d.id, ro.id, month " +
"ORDER BY d.id, ro.id, month";
stmt.executeUpdate(sql);
logger.info("Created la_views_stats_tmp table");
stmt.close();
ConnectDB.getHiveConnection().close();
}
private void downloadsStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Creating la_result_downloads_monthly_tmp view");
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
+ ".la_result_downloads_monthly_tmp AS " +
"SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " +
"THEN 1 ELSE 0 END) AS openaire_referrer, " +
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp where action='download' and " +
"(source_item_type='oaItem' or source_item_type='repItem') " +
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " +
"source ORDER BY source, entity_id";
stmt.executeUpdate(sql);
logger.info("Created la_result_downloads_monthly_tmp view");
logger.info("Dropping la_downloads_stats_tmp table");
sql = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".la_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Dropped la_downloads_stats_tmp table");
logger.info("Creating la_downloads_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp " +
"AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " +
"max(downloads) AS count, max(openaire_referrer) AS openaire " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_downloads_monthly_tmp p, " +
ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " +
"WHERE p.source=d.oid AND p.id=ro.oid " +
"GROUP BY d.id, ro.id, month " +
"ORDER BY d.id, ro.id, month";
stmt.executeUpdate(sql);
logger.info("Created la_downloads_stats_tmp table");
stmt.close();
ConnectDB.getHiveConnection().close();
}
private void updateProdTables() throws SQLException, Exception { private void updateProdTables() throws SQLException, Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement(); Statement stmt = ConnectDB.getHiveConnection().createStatement();
@ -346,40 +222,11 @@ public class LaReferenciaStats {
String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog " + String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog " +
"select * from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp"; "select * from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
/*****
logger.info("Updating views_stats");
sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " +
"select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp";
stmt.executeUpdate(sql);
// sql = "insert into public.views_stats select * from la_views_stats_tmp;";
// stmt.executeUpdate(sql);
logger.info("Updating downloads_stats");
sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " +
"select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserting data to usage_stats from lareferencia");
sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " +
"SELECT coalesce(ds.source, vs.source) as source, " +
"coalesce(ds.repository_id, vs.repository_id) as repository_id, " +
"coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " +
"coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " +
"coalesce(ds.openaire, 0) as openaire_downloads, " +
"coalesce(vs.openaire, 0) as openaire_views " +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp AS ds FULL OUTER JOIN " +
ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp AS vs ON ds.source=vs.source " +
"AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
stmt.executeUpdate(sql);
logger.info("Inserted data to usage_stats from lareferencia");
// sql = "insert into public.downloads_stats select * from la_downloads_stats_tmp;";
// stmt.executeUpdate(sql);
****/
logger.info("Dropping lareferencialogtmp"); logger.info("Dropping lareferencialogtmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp"; sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp";
logger.info("Dropped lareferencialogtmp"); logger.info("Dropped lareferencialogtmp");
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
View File
@ -1,9 +1,12 @@
package eu.dnetlib.oa.graph.usagerawdata.export; package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*; import java.io.*;
import java.net.Authenticator; import java.net.Authenticator;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.PreparedStatement; import java.sql.PreparedStatement;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.Statement; import java.sql.Statement;
@ -30,299 +33,299 @@ import org.slf4j.LoggerFactory;
*/ */
public class PiwikDownloadLogs { public class PiwikDownloadLogs {
private final String piwikUrl; private final String piwikUrl;
private Date startDate; private Date startDate;
private final String tokenAuth; private final String tokenAuth;
/* /*
* The Piwik's API method * The Piwik's API method
*/ */
private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
private final String format = "&format=json"; private final String format = "&format=json";
private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
this.piwikUrl = piwikUrl; this.piwikUrl = piwikUrl;
this.tokenAuth = tokenAuth; this.tokenAuth = tokenAuth;
} }
private String getPiwikLogUrl() { private String getPiwikLogUrl() {
return "https://" + piwikUrl + "/"; return "https://" + piwikUrl + "/";
} }
private String getJson(String url) throws Exception { private String getJson(String url) throws Exception {
try { try {
logger.debug("Connecting to download the JSON: " + url); logger.debug("Connecting to download the JSON: " + url);
URL website = new URL(url); URL website = new URL(url);
URLConnection connection = website.openConnection(); URLConnection connection = website.openConnection();
StringBuilder response; StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder(); response = new StringBuilder();
String inputLine; String inputLine;
while ((inputLine = in.readLine()) != null) { while ((inputLine = in.readLine()) != null) {
response.append(inputLine); response.append(inputLine);
} }
} }
return response.toString(); return response.toString();
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to get URL: " + url + " Exception: " + e); logger.error("Failed to get URL: " + url + " Exception: " + e);
throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
} }
} }
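A hedged sketch of how getJson is driven by the rest of this class: the Live.getLastVisitsDetails endpoint is assembled from the fields above and fetched in pages of 1000 rows via filter_offset. The site id and date below are placeholders, not values taken from the commit:

// Illustrative only; in the real flow piwikUrl and tokenAuth come from the constructor
// and the download loops below build one such URL per repository and per day.
String exampleUrl = getPiwikLogUrl() + APImethod + "&idSite=13&period=day&date=2020-12-01"
+ format + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
String firstPage = getJson(exampleUrl); // rows 0-999
String secondPage = getJson(exampleUrl + "&filter_offset=1000"); // rows 1000-1999
// an empty "[]" body marks the last page, which is exactly what the download loops check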
class WorkerThread implements Runnable { class WorkerThread implements Runnable {
private Calendar currDay; private Calendar currDay;
private int siteId; private int siteId;
private String repoLogsPath; private String repoLogsPath;
private String portalLogPath; private String portalLogPath;
private String portalMatomoID; private String portalMatomoID;
public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
String portalMatomoID) throws IOException { String portalMatomoID) throws IOException {
this.currDay = (Calendar) currDay.clone(); this.currDay = (Calendar) currDay.clone();
this.siteId = siteId; this.siteId = siteId;
this.repoLogsPath = repoLogsPath; this.repoLogsPath = repoLogsPath;
this.portalLogPath = portalLogPath; this.portalLogPath = portalLogPath;
this.portalMatomoID = portalMatomoID; this.portalMatomoID = portalMatomoID;

} }
public void run() { public void run() {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
System.out System.out
.println( .println(
Thread.currentThread().getName() + " (Start) Thread for " Thread.currentThread().getName() + " (Start) Thread for "
+ "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+ ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+ ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
try { try {
GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
} catch (Exception e) { } catch (Exception e) {
// TODO Auto-generated catch block // TODO Auto-generated catch block
e.printStackTrace(); e.printStackTrace();
} }
System.out System.out
.println( .println(
Thread.currentThread().getName() + " (End) Thread for " Thread.currentThread().getName() + " (End) Thread for "
+ "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+ ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+ ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
} }
public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
String portalMatomoID) throws Exception { String portalMatomoID) throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
Date date = currDay.getTime(); Date date = currDay.getTime();
logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
String period = "&period=day&date=" + sdf.format(date); String period = "&period=day&date=" + sdf.format(date);
String outFolder = ""; String outFolder = "";
if (siteId == Integer.parseInt(portalMatomoID)) { if (siteId == Integer.parseInt(portalMatomoID)) {
outFolder = portalLogPath; outFolder = portalLogPath;
} else { } else {
outFolder = repoLogsPath; outFolder = repoLogsPath;
} }
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
String content = ""; String content = "";
int i = 0; int i = 0;
JSONParser parser = new JSONParser(); JSONParser parser = new JSONParser();
StringBuffer totalContent = new StringBuffer(); StringBuffer totalContent = new StringBuffer();
FileSystem fs = FileSystem.get(new Configuration()); FileSystem fs = FileSystem.get(new Configuration());
do { do {
int writtenBytes = 0; int writtenBytes = 0;
String apiUrl = baseApiUrl; String apiUrl = baseApiUrl;
if (i > 0) { if (i > 0) {
apiUrl += "&filter_offset=" + (i * 1000); apiUrl += "&filter_offset=" + (i * 1000);
} }
content = getJson(apiUrl); content = getJson(apiUrl);
if (content.length() == 0 || content.equals("[]")) { if (content.length() == 0 || content.equals("[]")) {
break; break;
} }
FSDataOutputStream fin = fs FSDataOutputStream fin = fs
.create( .create(
new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ ".json"), + ".json"),
true); true);
JSONArray jsonArray = (JSONArray) parser.parse(content); JSONArray jsonArray = (JSONArray) parser.parse(content);
for (Object aJsonArray : jsonArray) { for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRaw = (JSONObject) aJsonArray; JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
fin.write(jsonObjectRawBytes); fin.write(jsonObjectRawBytes);
fin.writeChar('\n'); fin.writeChar('\n');
writtenBytes += jsonObjectRawBytes.length + 1; writtenBytes += jsonObjectRawBytes.length + 1;
} }
fin.close(); fin.close();
System.out System.out
.println( .println(
Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ ".json"); + ".json");
i++; i++;
} while (true); } while (true);
fs.close(); fs.close();
} }
} }
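The files produced above are newline-delimited JSON: every Matomo visit object is written on its own line, one HDFS file per page, named {siteId}_Piwiklog{date}_offset_{i}.json. That layout is what the JsonSerDe external tables created later in this commit read. A minimal hedged sketch of the same convention (path and payload are placeholders):

// Hypothetical path and payload, illustrating the one-JSON-object-per-line format used above.
FileSystem fs = FileSystem.get(new Configuration());
FSDataOutputStream out = fs.create(new Path("/tmp/13_Piwiklog2020-12-01_offset_0.json"), true);
out.write("{\"idSite\":\"13\",\"idVisit\":\"1\",\"actionDetails\":[]}".getBytes());
out.writeChar('\n');
out.close();
fs.close();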
public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
Statement statement = ConnectDB.getHiveConnection().createStatement(); Statement statement = ConnectDB.getHiveConnection().createStatement();
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
ResultSet rs = statement ResultSet rs = statement
.executeQuery( .executeQuery(
"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
// Getting all the piwikIds in a list for logging reasons & limiting the list // Getting all the piwikIds in a list for logging reasons & limiting the list
// to the max number of piwikids // to the max number of piwikids
List<Integer> piwikIdToVisit = new ArrayList<Integer>(); List<Integer> piwikIdToVisit = new ArrayList<Integer>();
//while (rs.next()) while (rs.next()) {
//piwikIdToVisit.add(rs.getInt(1)); piwikIdToVisit.add(rs.getInt(1));
piwikIdToVisit.add(13); }
piwikIdToVisit.add(109); logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
&& ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
} }
logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
// ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
for (int siteId : piwikIdToVisit) { for (int siteId : piwikIdToVisit) {
// Setting the starting period // Setting the starting period
Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
logger.info("Starting period for log download: " + sdf.format(start.getTime())); logger.info("Starting period for log download: " + sdf.format(start.getTime()));
// Setting the ending period (last day of the month) // Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1); Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1); end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("Ending period for log download: " + sdf.format(end.getTime())); // end.add(Calendar.MONTH, +1);
// end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("Ending period for log download: " + sdf.format(end.getTime()));
logger.info("Now working on piwikId: " + siteId); logger.info("Now working on piwikId: " + siteId);
PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
.prepareStatement( .prepareStatement(
"SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".piwiklog WHERE source=?"); + ".piwiklog WHERE source=?");
st.setInt(1, siteId); st.setInt(1, siteId);
Date dateMax = null; Date dateMax = null;
ResultSet rs_date = st.executeQuery(); ResultSet rs_date = st.executeQuery();
while (rs_date.next()) { while (rs_date.next()) {
logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) { && !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1))); start.setTime(sdf.parse(rs_date.getString(1)));
dateMax = sdf.parse(rs_date.getString(1)); dateMax = sdf.parse(rs_date.getString(1));
} }
} }
rs_date.close(); rs_date.close();
for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
// logger.info("Date used " + currDay.toString()); // logger.info("Date used " + currDay.toString());
// Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
// executor.execute(worker);// calling execute method of ExecutorService // executor.execute(worker);// calling execute method of ExecutorService
logger.info("Date used " + currDay.getTime().toString()); logger.info("Date used " + currDay.getTime().toString());
if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId);
} else { } else {
GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
} }
} }
} }
// executor.shutdown(); // executor.shutdown();
// while (!executor.isTerminated()) { // while (!executor.isTerminated()) {
// } // }
// System.out.println("Finished all threads"); // System.out.println("Finished all threads");
} }
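The commented-out executor lines above hint at a parallel variant. Assuming the existing WorkerThread and ExecuteWorkflow.numberOfDownloadThreads, a minimal wiring could look like the following hedged sketch (not the committed behaviour; requires the java.util.concurrent imports):

// Sketch only: submit one WorkerThread per day for the current siteId and wait for completion.
// WorkerThread's constructor throws IOException, so this runs inside a method that declares it.
ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
executor.execute(new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID));
}
executor.shutdown();
while (!executor.isTerminated()) {
// busy-wait, mirroring the commented-out original
}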
public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
String portalMatomoID) throws Exception { String portalMatomoID) throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
Date date = currDay.getTime(); Date date = currDay.getTime();
logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
String period = "&period=day&date=" + sdf.format(date); String period = "&period=day&date=" + sdf.format(date);
String outFolder = ""; String outFolder = "";
if (siteId == Integer.parseInt(portalMatomoID)) { if (siteId == Integer.parseInt(portalMatomoID)) {
outFolder = portalLogPath; outFolder = portalLogPath;
} else { } else {
outFolder = repoLogsPath; outFolder = repoLogsPath;
} }
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
String content = ""; String content = "";
int i = 0; int i = 0;
JSONParser parser = new JSONParser(); JSONParser parser = new JSONParser();
StringBuffer totalContent = new StringBuffer(); StringBuffer totalContent = new StringBuffer();
FileSystem fs = FileSystem.get(new Configuration()); FileSystem fs = FileSystem.get(new Configuration());
do { do {
int writtenBytes = 0; int writtenBytes = 0;
String apiUrl = baseApiUrl; String apiUrl = baseApiUrl;
if (i > 0) { if (i > 0) {
apiUrl += "&filter_offset=" + (i * 1000); apiUrl += "&filter_offset=" + (i * 1000);
} }
content = getJson(apiUrl); content = getJson(apiUrl);
if (content.length() == 0 || content.equals("[]")) { if (content.length() == 0 || content.equals("[]")) {
break; break;
} }
FSDataOutputStream fin = fs FSDataOutputStream fin = fs
.create( .create(
new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ ".json"), + ".json"),
true); true);
JSONArray jsonArray = (JSONArray) parser.parse(content); JSONArray jsonArray = (JSONArray) parser.parse(content);
for (Object aJsonArray : jsonArray) { for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRaw = (JSONObject) aJsonArray; JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
fin.write(jsonObjectRawBytes); fin.write(jsonObjectRawBytes);
fin.writeChar('\n'); fin.writeChar('\n');
writtenBytes += jsonObjectRawBytes.length + 1; writtenBytes += jsonObjectRawBytes.length + 1;
} }
fin.close(); fin.close();
System.out System.out
.println( .println(
Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ ".json"); + ".json");
i++; i++;
} while (true); } while (true);
fs.close(); fs.close();
} }
} }
View File
@ -60,7 +60,7 @@ public class PiwikStatsDB {
this.createTables(); this.createTables();
// The piwiklog table is not needed since it is built // The piwiklog table is not needed since it is built
// on top of JSON files // on top of JSON files
////////////this.createTmpTables(); //////////// this.createTmpTables();
} }
public ArrayList getRobotsList() { public ArrayList getRobotsList() {
@ -86,6 +86,7 @@ public class PiwikStatsDB {
logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
stmt.executeUpdate(dropDatabase); stmt.executeUpdate(dropDatabase);
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to drop database: " + e); logger.error("Failed to drop database: " + e);
throw new Exception("Failed to drop database: " + e.toString(), e); throw new Exception("Failed to drop database: " + e.toString(), e);
@ -117,10 +118,15 @@ public class PiwikStatsDB {
+ "into 100 buckets stored as orc tblproperties('transactional'='true')"; + "into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTablePiwikLog); stmt.executeUpdate(sqlCreateTablePiwikLog);
// String dropT = "TRUNCATE TABLE "
// + ConnectDB.getUsageStatsDBSchema()
// + ".piwiklog ";
// stmt.executeUpdate(dropT);
// logger.info("truncated piwiklog");
///////////////////////////////////////// /////////////////////////////////////////
// Rule for duplicate inserts @ piwiklog // Rule for duplicate inserts @ piwiklog
///////////////////////////////////////// /////////////////////////////////////////
String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
@ -131,7 +137,6 @@ public class PiwikStatsDB {
////////////////////////////////////////////////// //////////////////////////////////////////////////
// Rule for duplicate inserts @ process_portal_log // Rule for duplicate inserts @ process_portal_log
////////////////////////////////////////////////// //////////////////////////////////////////////////
stmt.close(); stmt.close();
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
@ -141,47 +146,6 @@ public class PiwikStatsDB {
} }
} }
/***** public void createTmpTables() throws Exception {
try {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ "stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTmpTablePiwikLog);
//////////////////////////////////////////////////
// Rule for duplicate inserts @ piwiklogtmp
//////////////////////////////////////////////////
//////////////////////////////////////////////////
// Copy from public.piwiklog to piwiklog
//////////////////////////////////////////////////
// String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;";
// stmt.executeUpdate(sqlCopyPublicPiwiklog);
String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTmpTablePortalLog);
//////////////////////////////////////////////////
// Rule for duplicate inserts @ process_portal_log_tmp
//////////////////////////////////////////////////
stmt.close();
} catch (Exception e) {
logger.error("Failed to create tmptables: " + e);
throw new Exception("Failed to create tmp tables: " + e.toString(), e);
// System.exit(0);
}
}
******/
public void processLogs() throws Exception { public void processLogs() throws Exception {
try { try {
ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL());
@ -203,23 +167,17 @@ public class PiwikStatsDB {
processPortalLog(); processPortalLog();
logger.info("Portal logs process done"); logger.info("Portal logs process done");
logger.info("Processing portal usagestats"); logger.info("Processing portal usagestats");
portalStats(); portalLogs();
logger.info("Portal usagestats process done"); logger.info("Portal usagestats process done");
/*****
logger.info("ViewsStats processing starts");
viewsStats();
logger.info("ViewsStats processing ends");
logger.info("DownloadsStats processing starts");
downloadsStats();
logger.info("DownloadsStats processing starts");
*****/
logger.info("Updating Production Tables"); logger.info("Updating Production Tables");
updateProdTables(); updateProdTables();
logger.info("Updated Production Tables"); logger.info("Updated Production Tables");
logger.info("Create Pedocs Tables");
createPedocsOldUsageData();
logger.info("Pedocs Tables Created");
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to process logs: " + e); logger.error("Failed to process logs: " + e);
@ -237,65 +195,65 @@ public class PiwikStatsDB {
logger.info("Added JSON Serde jar"); logger.info("Added JSON Serde jar");
logger.info("Dropping piwiklogtmp_json table"); logger.info("Dropping piwiklogtmp_json table");
String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".piwiklogtmp_json"; + ".piwiklogtmp_json";
stmt.executeUpdate(drop_piwiklogtmp_json); stmt.executeUpdate(drop_piwiklogtmp_json);
logger.info("Dropped piwiklogtmp_json table"); logger.info("Dropped piwiklogtmp_json table");
logger.info("Creating piwiklogtmp_json"); logger.info("Creating piwiklogtmp_json");
String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".piwiklogtmp_json(\n" + + ".piwiklogtmp_json(\n"
" `idSite` STRING,\n" + + " `idSite` STRING,\n"
" `idVisit` STRING,\n" + + " `idVisit` STRING,\n"
" `country` STRING,\n" + + " `country` STRING,\n"
" `referrerName` STRING,\n" + + " `referrerName` STRING,\n"
" `browser` STRING,\n" + + " `browser` STRING,\n"
" `actionDetails` ARRAY<\n" + + " `actionDetails` ARRAY<\n"
" struct<\n" + + " struct<\n"
" type: STRING,\n" + + " type: STRING,\n"
" url: STRING,\n" + + " url: STRING,\n"
" `customVariables`: struct<\n" + + " `customVariables`: struct<\n"
" `1`: struct<\n" + + " `1`: struct<\n"
" `customVariablePageValue1`: STRING\n" + + " `customVariablePageValue1`: STRING\n"
" >\n" + + " >\n"
" >,\n" + + " >,\n"
" timestamp: String\n" + + " timestamp: String\n"
" >\n" + + " >\n"
" >\n" + + " >\n"
")\n" + + ")\n"
"ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
"LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n"
"TBLPROPERTIES (\"transactional\"=\"false\")"; + "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(create_piwiklogtmp_json); stmt.executeUpdate(create_piwiklogtmp_json);
logger.info("Created piwiklogtmp_json"); logger.info("Created piwiklogtmp_json");
logger.info("Dropping piwiklogtmp table"); logger.info("Dropping piwiklogtmp table");
String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + String drop_piwiklogtmp = "DROP TABLE IF EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".piwiklogtmp"; + ".piwiklogtmp";
stmt.executeUpdate(drop_piwiklogtmp); stmt.executeUpdate(drop_piwiklogtmp);
logger.info("Dropped piwiklogtmp"); logger.info("Dropped piwiklogtmp");
logger.info("Creating piwiklogtmp"); logger.info("Creating piwiklogtmp");
String create_piwiklogtmp = "CREATE TABLE " + String create_piwiklogtmp = "CREATE TABLE "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + + ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, "
"entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
"clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; + "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(create_piwiklogtmp); stmt.executeUpdate(create_piwiklogtmp);
logger.info("Created piwiklogtmp"); logger.info("Created piwiklogtmp");
logger.info("Inserting into piwiklogtmp"); logger.info("Inserting into piwiklogtmp");
String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, "
"actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.type as action, actiondetail.url as url, "
"actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, "
"'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, "
"referrerName as referrer_name, browser as agent\n" + + "referrerName as referrer_name, browser as agent\n"
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n"
"LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
stmt.executeUpdate(insert_piwiklogtmp); stmt.executeUpdate(insert_piwiklogtmp);
logger.info("Inserted into piwiklogtmp"); logger.info("Inserted into piwiklogtmp");
@ -308,33 +266,31 @@ public class PiwikStatsDB {
logger.info("Cleaning download double clicks"); logger.info("Cleaning download double clicks");
// clean download double clicks // clean download double clicks
String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"WHERE EXISTS (\n" + + "WHERE EXISTS (\n"
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n"
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, "
ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n"
"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+ + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n"
"AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + + "AND p1.timestamp<p2.timestamp AND ((unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))/60)<30 \n"
"AND p1.timestamp<p2.timestamp AND ((unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))/60)<30 \n" + + "AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n"
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" + + "AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
logger.info("Cleaned download double clicks"); logger.info("Cleaned download double clicks");
// clean view double clicks // clean view double clicks
logger.info("Cleaning action double clicks"); logger.info("Cleaning action double clicks");
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"WHERE EXISTS (\n" + + "WHERE EXISTS (\n"
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n"
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, "
ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n"
"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+ + "AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp \n"
"AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp \n" + + "AND p1.timestamp<p2.timestamp AND (unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))<10 \n"
"AND p1.timestamp<p2.timestamp AND (unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))<10 \n" + + "AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n"
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" + + "AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
logger.info("Cleaned action double clicks"); logger.info("Cleaned action double clicks");
stmt.close(); stmt.close();
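Put concretely, the two DELETEs above treat a second 'download' on the same visit and entity as a double click when it arrives within 30 minutes of the first, and a second 'action' (view) within 10 seconds; only the earlier duplicate row is removed. A hypothetical helper (not present in the class) expressing the same windows:

// Assumption: timestamps as epoch seconds; mirrors the window checks in the DELETE statements above.
static boolean isDoubleClick(String action, long earlierEpochSec, long laterEpochSec) {
long gapSeconds = laterEpochSec - earlierEpochSec;
if ("download".equals(action)) {
return gapSeconds / 60.0 < 30; // 30-minute window for downloads
}
if ("action".equals(action)) {
return gapSeconds < 10; // 10-second window for page views
}
return false;
}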
@ -349,136 +305,107 @@ public class PiwikStatsDB {
logger.info("Added JSON Serde jar"); logger.info("Added JSON Serde jar");
logger.info("Dropping process_portal_log_tmp_json table"); logger.info("Dropping process_portal_log_tmp_json table");
String drop_process_portal_log_tmp_json = "DROP TABLE IF EXISTS " + String drop_process_portal_log_tmp_json = "DROP TABLE IF EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".process_portal_log_tmp_json"; + ".process_portal_log_tmp_json";
stmt.executeUpdate(drop_process_portal_log_tmp_json); stmt.executeUpdate(drop_process_portal_log_tmp_json);
logger.info("Dropped process_portal_log_tmp_json table"); logger.info("Dropped process_portal_log_tmp_json table");
logger.info("Creating process_portal_log_tmp_json"); logger.info("Creating process_portal_log_tmp_json");
String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS "
ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json(" + + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json("
" `idSite` STRING,\n" + + " `idSite` STRING,\n"
" `idVisit` STRING,\n" + + " `idVisit` STRING,\n"
" `country` STRING,\n" + + " `country` STRING,\n"
" `referrerName` STRING,\n" + + " `referrerName` STRING,\n"
" `browser` STRING,\n" + + " `browser` STRING,\n"
" `actionDetails` ARRAY<\n" + + " `actionDetails` ARRAY<\n"
" struct<\n" + + " struct<\n"
" type: STRING,\n" + + " type: STRING,\n"
" url: STRING,\n" + + " url: STRING,\n"
" timestamp: String\n" + + " timestamp: String\n"
" >\n" + + " >\n"
" >\n" + + " >\n"
")\n" + + ")\n"
"ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
"LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + + "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n"
"TBLPROPERTIES (\"transactional\"=\"false\")"; + "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(create_process_portal_log_tmp_json); stmt.executeUpdate(create_process_portal_log_tmp_json);
logger.info("Created process_portal_log_tmp_json"); logger.info("Created process_portal_log_tmp_json");
logger.info("Droping process_portal_log_tmp table"); logger.info("Droping process_portal_log_tmp table");
String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".process_portal_log_tmp"; + ".process_portal_log_tmp";
stmt.executeUpdate(drop_process_portal_log_tmp); stmt.executeUpdate(drop_process_portal_log_tmp);
logger.info("Dropped process_portal_log_tmp"); logger.info("Dropped process_portal_log_tmp");
logger.info("Creating process_portal_log_tmp"); logger.info("Creating process_portal_log_tmp");
String create_process_portal_log_tmp = "CREATE TABLE " + String create_process_portal_log_tmp = "CREATE TABLE "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + + ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, "
"entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
"clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(create_process_portal_log_tmp); stmt.executeUpdate(create_process_portal_log_tmp);
logger.info("Created process_portal_log_tmp"); logger.info("Created process_portal_log_tmp");
logger.info("Inserting into process_portal_log_tmp"); logger.info("Inserting into process_portal_log_tmp");
String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".process_portal_log_tmp " + + ".process_portal_log_tmp "
"SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
+ + "actiondetail.url as url, "
"actiondetail.url as url, " + + "CASE\n"
"CASE\n" + + " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] "
" WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + + " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] "
" WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
" WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " + " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] "
+ + " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] "
" WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + + " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] "
" WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + + " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] "
" WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + + " ELSE '' "
" WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + + "END AS entity_id, "
" ELSE '' " + + "CASE "
"END AS entity_id, " + + " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' "
"CASE " + + " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' "
" WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' "
" WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%articleId=%') THEN 'result' "
" WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' "
" WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%projectId=%') THEN 'project' "
" WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' "
" WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + + " ELSE '' "
" WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + + "END AS source_item_type, "
" ELSE '' " + + "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, "
"END AS source_item_type, " + + "browser as agent "
"from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json "
"browser as agent " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " +
"LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
stmt.executeUpdate(insert_process_portal_log_tmp); stmt.executeUpdate(insert_process_portal_log_tmp);
logger.info("Inserted into process_portal_log_tmp"); logger.info("Inserted into process_portal_log_tmp");
stmt.close(); stmt.close();
} }
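The CASE expressions above derive entity_id and source_item_type from the portal URL's query parameter. A hypothetical helper (name and structure are illustrative, not part of the commit) restating that mapping in plain Java:

// Echoes the CASE logic above: the query-string parameter determines both the id and the entity type.
// Empty parameter values are not handled, just as in the Hive CASE.
static String[] classifyPortalUrl(String url) {
if (url.contains("datasourceId=")) return new String[] { url.split("datasourceId=")[1], "datasource" };
if (url.contains("datasource=")) return new String[] { url.split("datasource=")[1], "datasource" };
if (url.contains("datasourceFilter=")) return new String[] { url.split("datasourceFilter=")[1], "datasource" };
if (url.contains("articleId=")) return new String[] { url.split("articleId=")[1], "result" };
if (url.contains("datasetId=")) return new String[] { url.split("datasetId=")[1], "result" };
if (url.contains("projectId=")) return new String[] { url.split("projectId=")[1], "project" };
if (url.contains("organizationId=")) return new String[] { url.split("organizationId=")[1], "organization" };
return new String[] { "", "" };
}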
public void portalStats() throws SQLException { public void portalLogs() throws SQLException {
Connection con = ConnectDB.getHiveConnection(); Connection con = ConnectDB.getHiveConnection();
Statement stmt = con.createStatement(); Statement stmt = con.createStatement();
con.setAutoCommit(false); con.setAutoCommit(false);
// Original queries were of the style
//
// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent
// FROM usagestats_20200907.process_portal_log_tmp2,
// openaire_prod_stats_20200821.result_oids roid
// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null
//
// The following query is an example of how queries should be
//
//
// INSERT INTO usagestats_20200907.piwiklogtmp
// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
// FROM usagestats_20200907.process_portal_log_tmp
// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id
// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL);
//
// We should consider whether we would like the queries to be like the following
//
// INSERT INTO usagestats_20200907.piwiklogtmp
// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent
// FROM usagestats_20200907.process_portal_log_tmp
// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id
// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND
// roid.oid != '');
logger.info("PortalStats - Step 1"); logger.info("PortalStats - Step 1");
String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
"WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
"IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".result_oids roid WHERE roid.id IS NOT NULL)"; + ".result_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("PortalStats - Step 2"); logger.info("PortalStats - Step 2");
stmt = con.createStatement(); stmt = con.createStatement();
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
"WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
"IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".datasource_oids roid WHERE roid.id IS NOT NULL)"; + ".datasource_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
@ -494,12 +421,11 @@ public class PiwikStatsDB {
*/ */
logger.info("PortalStats - Step 3"); logger.info("PortalStats - Step 3");
stmt = con.createStatement(); stmt = con.createStatement();
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
"WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
"IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ ".project_oids roid WHERE roid.id IS NOT NULL)"; + ".project_oids roid WHERE roid.id IS NOT NULL)";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
@ -512,233 +438,233 @@ public class PiwikStatsDB {
logger.info("Cleaning oai - Step 1"); logger.info("Cleaning oai - Step 1");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/',"
"'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; + "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
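Each of these cleaning steps applies the same normalisation: an OAI identifier written with a slash after the repository prefix is rewritten to the canonical colon form; Steps 2 onwards below repeat the pattern for other repositories. In plain Java, Step 1 is roughly equivalent to the following hedged one-liner (variable name and sample value are illustrative):

// "oai:repositorio.chlc.min-saude.pt/some/handle" becomes "oai:repositorio.chlc.min-saude.pt:some/handle"
String normalised = entityId.replaceFirst("^oai:repositorio.chlc.min-saude.pt/",
"oai:repositorio.chlc.min-saude.pt:");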
logger.info("Cleaning oai - Step 2"); logger.info("Cleaning oai - Step 2");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/',"
"'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; + "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 3"); logger.info("Cleaning oai - Step 3");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/',"
"'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; + "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 4"); logger.info("Cleaning oai - Step 4");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/',"
"'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; + "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 5"); logger.info("Cleaning oai - Step 5");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/',"
"'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 6"); logger.info("Cleaning oai - Step 6");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/',"
"'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 7"); logger.info("Cleaning oai - Step 7");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/',"
"'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 8"); logger.info("Cleaning oai - Step 8");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/',"
"'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 9"); logger.info("Cleaning oai - Step 9");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/',"
"'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 10"); logger.info("Cleaning oai - Step 10");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/',"
"'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 11"); logger.info("Cleaning oai - Step 11");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/',"
"'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 12"); logger.info("Cleaning oai - Step 12");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/',"
"'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 13"); logger.info("Cleaning oai - Step 13");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/',"
"'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 14"); logger.info("Cleaning oai - Step 14");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/',"
"'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 15"); logger.info("Cleaning oai - Step 15");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/',"
"'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 16"); logger.info("Cleaning oai - Step 16");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/',"
"'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 17"); logger.info("Cleaning oai - Step 17");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/',"
"'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 18"); logger.info("Cleaning oai - Step 18");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/',"
"'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 19"); logger.info("Cleaning oai - Step 19");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/',"
"'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 20"); logger.info("Cleaning oai - Step 20");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/',"
"'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 21"); logger.info("Cleaning oai - Step 21");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/',"
"'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 22"); logger.info("Cleaning oai - Step 22");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/',"
"'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 23"); logger.info("Cleaning oai - Step 23");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/',"
"'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 24"); logger.info("Cleaning oai - Step 24");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/',"
"'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 25"); logger.info("Cleaning oai - Step 25");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/',"
"'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 26"); logger.info("Cleaning oai - Step 26");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/',"
"'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 27"); logger.info("Cleaning oai - Step 27");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/',"
"'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 28"); logger.info("Cleaning oai - Step 28");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/',"
"'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; + "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
logger.info("Cleaning oai - Step 29"); logger.info("Cleaning oai - Step 29");
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
"SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/',"
"'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
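// Note: the cleaning steps above all apply the same normalization -- rewriting
// "oai:<domain>/" to "oai:<domain>:" -- for a fixed list of repository domains.
// A minimal, purely illustrative sketch of an equivalent loop (hypothetical helper,
// not part of this class) could look like:
//
//   private void normalizeOaiPrefixes(Statement stmt, List<String> domains) throws SQLException {
//       for (String domain : domains) {
//           String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
//               + "SET entity_id = regexp_replace(entity_id, '^oai:" + domain + "/', "
//               + "'oai:" + domain + ":') WHERE entity_id LIKE 'oai:" + domain + "/%'";
//           stmt.executeUpdate(sql);
//       }
//   }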
@ -746,63 +672,83 @@ public class PiwikStatsDB {
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
} }
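// processPortalURL (below) maps an EXPLORE portal URL to a typed identifier of the form
// "datasource|<id>", "result|<id>", "project|<id>" or "organization|<id>".
// The fixed offsets (e.g. 13 + 46 = 59 for "datasourceId=") assume that the value following
// the query parameter is a 46-character OpenAIRE identifier; URLs that do not match any
// known parameter are mapped to the empty string.
// e.g. ".../search/dataset?datasetId=<46-char id>" would become "result|<46-char id>".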
private String processPortalURL(String url) {
if (url.indexOf("explore.openaire.eu") > 0) {
try {
url = URLDecoder.decode(url, "UTF-8");
} catch (Exception e) {
logger.info("Error when decoding the following URL: " + url);
}
if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) {
url = "datasource|"
+ url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59);
} else if (url.indexOf("datasource=") > 0
&& url.substring(url.indexOf("datasource=") + 11).length() >= 46) {
url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57);
} else if (url.indexOf("datasourceFilter=") > 0
&& url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) {
url = "datasource|"
+ url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63);
} else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) {
url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56);
} else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) {
url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56);
} else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46
&& !url.contains("oai:dnet:corda")) {
url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56);
} else if (url.indexOf("organizationId=") > 0
&& url.substring(url.indexOf("organizationId=") + 15).length() >= 46) {
url = "organization|"
+ url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61);
} else {
url = "";
}
} else {
url = "";
}
return url;
}
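// updateProdTables (below) copies the rows gathered in piwiklogtmp into the permanent
// piwiklog table and then drops the temporary piwiklogtmp and process_portal_log_tmp tables.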
private void updateProdTables() throws SQLException { private void updateProdTables() throws SQLException {
Statement stmt = ConnectDB.getHiveConnection().createStatement(); Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false); ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Inserting data to piwiklog"); logger.info("Inserting data to piwiklog");
String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog "
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
logger.info("Dropping piwiklogtmp"); logger.info("Dropping piwiklogtmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
logger.info("Dropped piwiklogtmp"); logger.info("Dropped piwiklogtmp");
logger.info("Dropping process_portal_log_tmp"); logger.info("Dropping process_portal_log_tmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
logger.info("Dropped process_portal_log_tmp"); logger.info("Dropped process_portal_log_tmp");
stmt.close();
ConnectDB.getHiveConnection().close();
}
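// finalizeStats (below) drops the remaining temporary/staging tables (piwiklogtmp,
// process_portal_log_tmp, the irus/sarc/lareferencia *_json tables, etc.), presumably
// after their contents have been copied to the permanent tables by the preceding steps.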
public void finalizeStats() throws SQLException {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping piwiklogtmp");
String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
stmt.executeUpdate(sql);
logger.info("Dropped piwiklogtmp");
logger.info("Dropping process_portal_log_tmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp";
stmt.executeUpdate(sql);
logger.info("Dropped process_portal_log_tmp");
logger.info("Dropping irus_sushilogtmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp";
stmt.executeUpdate(sql);
logger.info("Dropped irus_sushilogtmp");
logger.info("Dropping irus_sushilogtmp_json");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json";
stmt.executeUpdate(sql);
logger.info("Dropped irus_sushilogtmp_json");
logger.info("Dropping lareferencialogtmp_json");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json";
stmt.executeUpdate(sql);
logger.info("Dropped lareferencialogtmp_json");
logger.info("Dropping piwiklogtmp_json");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json";
stmt.executeUpdate(sql);
logger.info("Dropped piwiklogtmp_json");
logger.info("Dropping process_portal_log_tmp_json");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json";
stmt.executeUpdate(sql);
logger.info("Dropped process_portal_log_tmp_json");
logger.info("Dropping sarc_sushilogtmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
stmt.executeUpdate(sql);
logger.info("Dropped sarc_sushilogtmp");
logger.info("Dropping sarc_sushilogtmp_json_array");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
stmt.executeUpdate(sql);
logger.info("Dropped sarc_sushilogtmp_json_array");
logger.info("Dropping sarc_sushilogtmp_json_non_array");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
stmt.executeUpdate(sql);
logger.info("Dropped sarc_sushilogtmp_json_non_array");
stmt.close(); stmt.close();
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
@ -868,4 +814,22 @@ public class PiwikStatsDB {
private Connection getConnection() throws SQLException { private Connection getConnection() throws SQLException {
return ConnectDB.getHiveConnection(); return ConnectDB.getHiveConnection();
} }
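// createPedocsOldUsageData (below) snapshots the legacy PeDocs usage data: it copies
// default.pedocsviews and default.pedocsdownloads into the usage stats schema as
// pedocsoldviews and pedocsolddownloads (created only if they do not already exist).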
public void createPedocsOldUsageData() throws SQLException {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Creating PeDocs Old Views Table");
String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".pedocsoldviews as select * from default.pedocsviews";
stmt.executeUpdate(sql);
logger.info("PeDocs Old Views Table created");
logger.info("Creating PeDocs Old Downloads Table");
sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".pedocsolddownloads as select * from default.pedocsdownloads";
stmt.executeUpdate(sql);
logger.info("PeDocs Old Downloads Table created");
}
} }


@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagerawdata.export; package eu.dnetlib.oa.graph.usagerawdata.export;
import java.io.*; import java.io.*;
@ -33,543 +34,467 @@ import org.slf4j.LoggerFactory;
*/ */
public class SarcStats { public class SarcStats {
private Statement stmtHive = null; private Statement stmtHive = null;
private Statement stmtImpala = null; private Statement stmtImpala = null;
private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
public SarcStats() throws Exception { public SarcStats() throws Exception {
// createTables(); // createTables();
} }
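// createTables (below) is not invoked here (the call in the constructor above is commented
// out); its DDL (TEXT columns, PRIMARY KEY, CREATE OR REPLACE RULE) looks like a leftover
// from an earlier PostgreSQL-based implementation rather than valid HiveQL, so it appears
// to be kept only for reference.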
private void createTables() throws Exception { private void createTables() throws Exception {
try { try {
stmtHive = ConnectDB.getHiveConnection().createStatement(); stmtHive = ConnectDB.getHiveConnection().createStatement();
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
stmtHive.executeUpdate(sqlCreateTableSushiLog); stmtHive.executeUpdate(sqlCreateTableSushiLog);
// String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
// stmt.executeUpdate(sqlCopyPublicSushiLog); // stmt.executeUpdate(sqlCopyPublicSushiLog);
String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ " ON INSERT TO sushilog " + " ON INSERT TO sushilog "
+ " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
+ "sushilog.rid, sushilog.date " + "sushilog.rid, sushilog.date "
+ "FROM sushilog " + "FROM sushilog "
+ "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
stmtHive.executeUpdate(sqlcreateRuleSushiLog); stmtHive.executeUpdate(sqlcreateRuleSushiLog);
String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
stmtHive.executeUpdate(createSushiIndex); stmtHive.executeUpdate(createSushiIndex);
stmtHive.close(); stmtHive.close();
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
logger.info("Sushi Tables Created"); logger.info("Sushi Tables Created");
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to create tables: " + e); logger.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e); throw new Exception("Failed to create tables: " + e.toString(), e);
} }
} }
public void reCreateLogDirs() throws IOException { public void reCreateLogDirs() throws IOException {
FileSystem dfs = FileSystem.get(new Configuration()); FileSystem dfs = FileSystem.get(new Configuration());
logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true);
logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true);
logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray));
logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray));
} }
public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement(); Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false); ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Adding JSON Serde jar"); logger.info("Adding JSON Serde jar");
stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
logger.info("Added JSON Serde jar"); logger.info("Added JSON Serde jar");
logger.info("Dropping sarc_sushilogtmp_json_array table"); logger.info("Dropping sarc_sushilogtmp_json_array table");
String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); stmt.executeUpdate(drop_sarc_sushilogtmp_json_array);
logger.info("Dropped sarc_sushilogtmp_json_array table"); logger.info("Dropped sarc_sushilogtmp_json_array table");
logger.info("Creating sarc_sushilogtmp_json_array table"); logger.info("Creating sarc_sushilogtmp_json_array table");
String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n"
+ " `ItemIdentifier` ARRAY<\n" + " `ItemIdentifier` ARRAY<\n"
+ " struct<\n" + " struct<\n"
+ " `Type`: STRING,\n" + " `Type`: STRING,\n"
+ " `Value`: STRING\n" + " `Value`: STRING\n"
+ " >\n" + " >\n"
+ " >,\n" + " >,\n"
+ " `ItemPerformance` struct<\n" + " `ItemPerformance` struct<\n"
+ " `Period`: struct<\n" + " `Period`: struct<\n"
+ " `Begin`: STRING,\n" + " `Begin`: STRING,\n"
+ " `End`: STRING\n" + " `End`: STRING\n"
+ " >,\n" + " >,\n"
+ " `Instance`: struct<\n" + " `Instance`: struct<\n"
+ " `Count`: STRING,\n" + " `Count`: STRING,\n"
+ " `MetricType`: STRING\n" + " `MetricType`: STRING\n"
+ " >\n" + " >\n"
+ " >\n" + " >\n"
+ ")" + ")"
+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ "LOCATION '" + sarcsReportPathArray + "/'\n" + "LOCATION '" + sarcsReportPathArray + "/'\n"
+ "TBLPROPERTIES (\"transactional\"=\"false\")"; + "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(create_sarc_sushilogtmp_json_array); stmt.executeUpdate(create_sarc_sushilogtmp_json_array);
logger.info("Created sarc_sushilogtmp_json_array table"); logger.info("Created sarc_sushilogtmp_json_array table");
logger.info("Dropping sarc_sushilogtmp_json_non_array table"); logger.info("Dropping sarc_sushilogtmp_json_non_array table");
String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ConnectDB.getUsageStatsDBSchema()
+ ".sarc_sushilogtmp_json_non_array"; + ".sarc_sushilogtmp_json_non_array";
stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array);
logger.info("Dropped sarc_sushilogtmp_json_non_array table"); logger.info("Dropped sarc_sushilogtmp_json_non_array table");
logger.info("Creating sarc_sushilogtmp_json_non_array table"); logger.info("Creating sarc_sushilogtmp_json_non_array table");
String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n"
+ " `ItemIdentifier` struct<\n" + " `ItemIdentifier` struct<\n"
+ " `Type`: STRING,\n" + " `Type`: STRING,\n"
+ " `Value`: STRING\n" + " `Value`: STRING\n"
+ " >,\n" + " >,\n"
+ " `ItemPerformance` struct<\n" + " `ItemPerformance` struct<\n"
+ " `Period`: struct<\n" + " `Period`: struct<\n"
+ " `Begin`: STRING,\n" + " `Begin`: STRING,\n"
+ " `End`: STRING\n" + " `End`: STRING\n"
+ " >,\n" + " >,\n"
+ " `Instance`: struct<\n" + " `Instance`: struct<\n"
+ " `Count`: STRING,\n" + " `Count`: STRING,\n"
+ " `MetricType`: STRING\n" + " `MetricType`: STRING\n"
+ " >\n" + " >\n"
+ " >" + " >"
+ ")" + ")"
+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ "LOCATION '" + sarcsReportPathNonArray + "/'\n" + "LOCATION '" + sarcsReportPathNonArray + "/'\n"
+ "TBLPROPERTIES (\"transactional\"=\"false\")"; + "TBLPROPERTIES (\"transactional\"=\"false\")";
stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array);
logger.info("Created sarc_sushilogtmp_json_non_array table"); logger.info("Created sarc_sushilogtmp_json_non_array table");
logger.info("Creating sarc_sushilogtmp table"); logger.info("Creating sarc_sushilogtmp table");
String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sarc_sushilogtmp(source STRING, repository STRING, " + ".sarc_sushilogtmp(source STRING, repository STRING, "
+ "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
+ "tblproperties('transactional'='true')"; + "tblproperties('transactional'='true')";
stmt.executeUpdate(create_sarc_sushilogtmp); stmt.executeUpdate(create_sarc_sushilogtmp);
logger.info("Created sarc_sushilogtmp table"); logger.info("Created sarc_sushilogtmp table");
logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
+ "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
+ " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
+ "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array "
+ "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
+ "WHERE `ItemIdent`.`Type`='DOI'"; + "WHERE `ItemIdent`.`Type`='DOI'";
stmt.executeUpdate(insert_sarc_sushilogtmp); stmt.executeUpdate(insert_sarc_sushilogtmp);
logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
+ "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
+ "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
+ "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
stmt.executeUpdate(insert_sarc_sushilogtmp); stmt.executeUpdate(insert_sarc_sushilogtmp);
logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
} }
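// getAndProcessSarc (below) ensures the permanent sushilog table exists, drops any previous
// sarc_sushilogtmp, and then walks a hard-coded list of RCAAP journal SUSHI-Lite endpoints
// (base URL + ISSN), downloading one AR1 report per month via getARReport, e.g.
// <base>GetReport/?Report=AR1&Format=json&BeginDate=2020-03&EndDate=2020-03
// (the date values here are only an example of the format used further down).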
public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement(); Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false); ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Creating sushilog table"); logger.info("Creating sushilog table");
String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog " + ".sushilog "
+ "(`source` string, " + "(`source` string, "
+ "`repository` string, " + "`repository` string, "
+ "`rid` string, " + "`rid` string, "
+ "`date` string, " + "`date` string, "
+ "`metric_type` string, " + "`metric_type` string, "
+ "`count` int)"; + "`count` int)";
stmt.executeUpdate(createSushilog); stmt.executeUpdate(createSushilog);
logger.info("Created sushilog table"); logger.info("Created sushilog table");
logger.info("Dropping sarc_sushilogtmp table"); logger.info("Dropping sarc_sushilogtmp table");
String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ConnectDB.getUsageStatsDBSchema()
+ ".sarc_sushilogtmp"; + ".sarc_sushilogtmp";
stmt.executeUpdate(drop_sarc_sushilogtmp); stmt.executeUpdate(drop_sarc_sushilogtmp);
logger.info("Dropped sarc_sushilogtmp table"); logger.info("Dropped sarc_sushilogtmp table");
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
List<String[]> issnAndUrls = new ArrayList<String[]>(); List<String[]> issnAndUrls = new ArrayList<String[]>();
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826"
}); });
issnAndUrls.add(new String[]{ issnAndUrls.add(new String[] {
"https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015"
}); });
if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0
&& ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) {
logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload);
issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload);
} }
logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls);
for (String[] issnAndUrl : issnAndUrls) { for (String[] issnAndUrl : issnAndUrls) {
logger.info("Now working on ISSN: " + issnAndUrl[1]); logger.info("Now working on ISSN: " + issnAndUrl[1]);
getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]);
} }
} }
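// updateSarcLogs (below, renamed from finalizeSarcStats in this commit) appends the freshly
// downloaded sarc_sushilogtmp rows to the permanent sushilog table; the large Impala-based
// downloads_stats block that was commented out in the previous version of this method has
// been removed.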
public void finalizeSarcStats() throws Exception { public void updateSarcLogs() throws Exception {
stmtHive = ConnectDB.getHiveConnection().createStatement(); stmtHive = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false); ConnectDB.getHiveConnection().setAutoCommit(false);
stmtImpala = ConnectDB.getImpalaConnection().createStatement(); stmtImpala = ConnectDB.getImpalaConnection().createStatement();
/*
logger.info("Creating downloads_stats table_tmp");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats_tmp "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`result_id` string, "
+ "`date` string, "
+ "`count` bigint, "
+ "`openaire` bigint)";
stmtHive.executeUpdate(createDownloadsStats);
logger.info("Created downloads_stats_tmp table");
logger.info("Dropping sarc_sushilogtmp_impala table"); // Insert into sushilog
String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " logger.info("Inserting into sushilog");
+ ConnectDB.getUsageStatsDBSchema() String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".sarc_sushilogtmp_impala"; + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); stmtHive.executeUpdate(insertSushiLog);
logger.info("Dropped sarc_sushilogtmp_impala table"); logger.info("Inserted into sushilog");
logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); stmtHive.close();
String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() ConnectDB.getHiveConnection().close();
+ ".sarc_sushilogtmp_impala " }
+ "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
stmtHive.executeUpdate(createSarcSushilogtmpImpala);
logger.info("Created sarc_sushilogtmp_impala");
logger.info("Making sarc_sushilogtmp visible to impala"); public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray,
String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() String url, String issn) throws Exception {
+ ".sarc_sushilogtmp_impala;"; logger.info("Processing SARC! issn: " + issn + " with url: " + url);
stmtImpala.executeUpdate(invalidateMetadata); ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping downloads_stats_impala table"); SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " // Setting the starting period
+ ConnectDB.getUsageStatsDBSchema() Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ ".downloads_stats_impala"; logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
stmtHive.executeUpdate(drop_downloads_stats_impala);
logger.info("Dropped downloads_stats_impala table");
logger.info("Making downloads_stats_impala deletion visible to impala"); // Setting the ending period (last day of the month)
try { // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() // end.add(Calendar.MONTH, +1);
+ ".downloads_stats_impala;"; // end.add(Calendar.DAY_OF_MONTH, -1);
stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); Calendar end = Calendar.getInstance();
} catch (SQLException sqle) { end.add(Calendar.DAY_OF_MONTH, -1);
}
// We run the following query in Impala because it is faster logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
logger.info("Creating downloads_stats_impala");
String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats_impala AS "
+ "SELECT s.source, d.id AS repository_id, "
+ "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
+ "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, "
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+ ConnectDB.getStatsDBSchema() + ".result_pids ro "
+ "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
+ "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
stmtImpala.executeUpdate(createDownloadsStatsImpala);
logger.info("Creating downloads_stats_impala");
// Insert into downloads_stats SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp"); PreparedStatement st = ConnectDB
String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() .getHiveConnection()
+ ".downloads_stats_tmp SELECT * " .prepareStatement(
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
stmtHive.executeUpdate(insertDStats); st.setString(1, issn);
logger.info("Inserted into downloads_stats_tmp"); ResultSet rs_date = st.executeQuery();
Date dateMax = null;
while (rs_date.next()) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
dateMax = sdf.parse(rs_date.getString(1));
}
}
rs_date.close();
logger.info("Creating sushilog table"); // Creating the needed configuration for the correct storing of data
String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() Configuration config = new Configuration();
+ ".sushilog " config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
+ "(`source` string, " config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
+ "`repository_id` string, " config
+ "`rid` string, " .set(
+ "`date` string, " "fs.hdfs.impl",
+ "`metric_type` string, " org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+ "`count` int)"; config
stmtHive.executeUpdate(createSushilog); .set(
logger.info("Created sushilog table"); "fs.file.impl",
*/ org.apache.hadoop.fs.LocalFileSystem.class.getName());
// Insert into sushilog FileSystem dfs = FileSystem.get(config);
logger.info("Inserting into sushilog");
String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
stmtHive.executeUpdate(insertSushiLog);
logger.info("Inserted into sushilog");
stmtHive.close(); if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) {
ConnectDB.getHiveConnection().close(); logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn);
} } else {
start.add(Calendar.MONTH, 1);
while (start.before(end)) {
String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
+ simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
start.add(Calendar.MONTH, 1);
public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, logger.info("(getARReport) Getting report: " + reportUrl);
String url, String issn) throws Exception { String text = getJson(reportUrl);
logger.info("Processing SARC! issn: " + issn + " with url: " + url); if (text == null) {
ConnectDB.getHiveConnection().setAutoCommit(false); continue;
}
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); JSONParser parser = new JSONParser();
// Setting the starting period JSONObject jsonObject = null;
Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); try {
logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); jsonObject = (JSONObject) parser.parse(text);
} // if there is a parsing error continue with the next url
catch (ParseException pe) {
continue;
}
// Setting the ending period (last day of the month) jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse");
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); jsonObject = (JSONObject) jsonObject.get("sc:Report");
end.add(Calendar.MONTH, +1); if (jsonObject == null) {
end.add(Calendar.DAY_OF_MONTH, -1); continue;
logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); }
jsonObject = (JSONObject) jsonObject.get("c:Report");
jsonObject = (JSONObject) jsonObject.get("c:Customer");
Object obj = jsonObject.get("c:ReportItems");
JSONArray jsonArray = new JSONArray();
if (obj instanceof JSONObject) {
jsonArray.add(obj);
} else {
jsonArray = (JSONArray) obj;
// jsonArray = (JSONArray) jsonObject.get("c:ReportItems");
}
if (jsonArray == null) {
continue;
}
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); // Creating the file in the filesystem for the ItemIdentifier as array object
PreparedStatement st = ConnectDB String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_"
.getHiveConnection() + simpleDateFormat.format(start.getTime()) + ".json";
.prepareStatement( logger.info("Storing to file: " + filePathArray);
"SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true);
st.setString(1, issn);
ResultSet rs_date = st.executeQuery();
Date dateMax = null;
while (rs_date.next()) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
dateMax = sdf.parse(rs_date.getString(1));
}
}
rs_date.close();
// Creating the needed configuration for the correct storing of data // Creating the file in the filesystem for the ItemIdentifier as array object
Configuration config = new Configuration(); String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_"
config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); + simpleDateFormat.format(start.getTime()) + ".json";
config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); logger.info("Storing to file: " + filePathNonArray);
config FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true);
.set(
"fs.hdfs.impl",
org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
config
.set(
"fs.file.impl",
org.apache.hadoop.fs.LocalFileSystem.class.getName());
FileSystem dfs = FileSystem.get(config);
if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { for (Object aJsonArray : jsonArray) {
logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn);
} else {
while (start.before(end)) {
String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
+ simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
start.add(Calendar.MONTH, 1);
logger.info("(getARReport) Getting report: " + reportUrl); JSONObject jsonObjectRow = (JSONObject) aJsonArray;
String text = getJson(reportUrl); renameKeysRecursively(":", jsonObjectRow);
if (text == null) {
continue;
}
JSONParser parser = new JSONParser(); if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) {
JSONObject jsonObject = null; finNonArray.write(jsonObjectRow.toJSONString().getBytes());
try { finNonArray.writeChar('\n');
jsonObject = (JSONObject) parser.parse(text); } else {
} // if there is a parsing error continue with the next url finArray.write(jsonObjectRow.toJSONString().getBytes());
catch (ParseException pe) { finArray.writeChar('\n');
continue; }
} }
jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); finArray.close();
jsonObject = (JSONObject) jsonObject.get("sc:Report"); finNonArray.close();
if (jsonObject == null) {
continue;
}
jsonObject = (JSONObject) jsonObject.get("c:Report");
jsonObject = (JSONObject) jsonObject.get("c:Customer");
Object obj = jsonObject.get("c:ReportItems");
JSONArray jsonArray = new JSONArray();
if (obj instanceof JSONObject) {
jsonArray.add(obj);
} else {
jsonArray = (JSONArray) obj;
// jsonArray = (JSONArray) jsonObject.get("c:ReportItems");
}
if (jsonArray == null) {
continue;
}
// Creating the file in the filesystem for the ItemIdentifier as array object // Check the file size and if it is too big, delete it
String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" File fileArray = new File(filePathArray);
+ simpleDateFormat.format(start.getTime()) + ".json"; if (fileArray.length() == 0) {
logger.info("Storing to file: " + filePathArray); fileArray.delete();
FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); }
File fileNonArray = new File(filePathNonArray);
if (fileNonArray.length() == 0) {
fileNonArray.delete();
}
// Creating the file in the filesystem for the ItemIdentifier as array object }
String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_"
+ simpleDateFormat.format(start.getTime()) + ".json";
logger.info("Storing to file: " + filePathNonArray);
FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true);
for (Object aJsonArray : jsonArray) { dfs.close();
}
// ConnectDB.getHiveConnection().close();
}
JSONObject jsonObjectRow = (JSONObject) aJsonArray; private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception {
renameKeysRecursively(":", jsonObjectRow); for (Object jjval : givenJsonObj) {
if (jjval instanceof JSONArray) {
if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { renameKeysRecursively(delimiter, (JSONArray) jjval);
finNonArray.write(jsonObjectRow.toJSONString().getBytes()); } else if (jjval instanceof JSONObject) {
finNonArray.writeChar('\n'); renameKeysRecursively(delimiter, (JSONObject) jjval);
} else { } // All other types of vals
finArray.write(jsonObjectRow.toJSONString().getBytes()); else
finArray.writeChar('\n');
}
}
finArray.close();
finNonArray.close();
// Check the file size and if it is too big, delete it
File fileArray = new File(filePathArray);
if (fileArray.length() == 0)
fileArray.delete();
File fileNonArray = new File(filePathNonArray);
if (fileNonArray.length() == 0)
fileNonArray.delete();
}
dfs.close();
}
//ConnectDB.getHiveConnection().close();
}
private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception {
for (Object jjval : givenJsonObj) {
if (jjval instanceof JSONArray) {
renameKeysRecursively(delimiter, (JSONArray) jjval);
} else if (jjval instanceof JSONObject) {
renameKeysRecursively(delimiter, (JSONObject) jjval);
} // All other types of vals
else
; ;
} }
} }
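// The JSONObject overload below strips the namespace prefix from every key (it keeps only
// the part after the last delimiter, here ":"), so that e.g. a key such as "c:ItemIdentifier"
// becomes "ItemIdentifier" and matches the column names of the external Hive tables defined
// in processSarc. It recurses into nested objects and arrays.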
private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception {
Set<String> jkeys = new HashSet<String>(givenJsonObj.keySet()); Set<String> jkeys = new HashSet<String>(givenJsonObj.keySet());
for (String jkey : jkeys) { for (String jkey : jkeys) {
String[] splitArray = jkey.split(delimiter); String[] splitArray = jkey.split(delimiter);
String newJkey = splitArray[splitArray.length - 1]; String newJkey = splitArray[splitArray.length - 1];
Object jval = givenJsonObj.get(jkey); Object jval = givenJsonObj.get(jkey);
givenJsonObj.remove(jkey); givenJsonObj.remove(jkey);
givenJsonObj.put(newJkey, jval); givenJsonObj.put(newJkey, jval);
if (jval instanceof JSONObject) { if (jval instanceof JSONObject) {
renameKeysRecursively(delimiter, (JSONObject) jval); renameKeysRecursively(delimiter, (JSONObject) jval);
} }
if (jval instanceof JSONArray) { if (jval instanceof JSONArray) {
renameKeysRecursively(delimiter, (JSONArray) jval); renameKeysRecursively(delimiter, (JSONArray) jval);
} }
} }
} }
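// getJson (below) fetches a report URL with a plain URLConnection and returns the body as a
// string; on any failure it logs the error and returns an empty string, which the caller then
// fails to parse and skips before moving on to the next report. The commented-out lines are
// the remains of an unused HTTP Basic authentication variant.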
private String getJson(String url) throws Exception { private String getJson(String url) throws Exception {
// String cred=username+":"+password; // String cred=username+":"+password;
// String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
try { try {
URL website = new URL(url); URL website = new URL(url);
URLConnection connection = website.openConnection(); URLConnection connection = website.openConnection();
// connection.setRequestProperty ("Authorization", "Basic "+encoded); // connection.setRequestProperty ("Authorization", "Basic "+encoded);
StringBuilder response; StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder(); response = new StringBuilder();
String inputLine; String inputLine;
while ((inputLine = in.readLine()) != null) { while ((inputLine = in.readLine()) != null) {
response.append(inputLine); response.append(inputLine);
response.append("\n"); response.append("\n");
} }
} }
return response.toString(); return response.toString();
} catch (Exception e) { } catch (Exception e) {
// Logging error and silently continuing // Logging error and silently continuing
logger.error("Failed to get URL: " + e); logger.error("Failed to get URL: " + e);
System.out.println("Failed to get URL: " + e); System.out.println("Failed to get URL: " + e);
// return null; // return null;
// throw new Exception("Failed to get URL: " + e.toString(), e); // throw new Exception("Failed to get URL: " + e.toString(), e);
} }
return ""; return "";
} }
} }


@ -13,7 +13,7 @@ import org.slf4j.LoggerFactory;
/** /**
* Main class for downloading and processing Usage statistics * Main class for downloading and processing Usage statistics
* *
* @author D. Pierrakos, S. Zoupanos * @author D. Pierrakos, S. Zoupanos
*/ */
public class UsageStatsExporter { public class UsageStatsExporter {
@ -51,19 +51,13 @@ public class UsageStatsExporter {
logger.info("Initialising DB properties"); logger.info("Initialising DB properties");
ConnectDB.init(); ConnectDB.init();
// runImpalaQuery();
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
logger.info("Re-creating database and tables"); logger.info("Re-creating database and tables");
if (ExecuteWorkflow.recreateDbAndTables){ if (ExecuteWorkflow.recreateDbAndTables) {
piwikstatsdb.recreateDBAndTables(); piwikstatsdb.recreateDBAndTables();
logger.info("DB-Tables-TmpTables are created "); logger.info("DB-Tables-TmpTables are created ");
} }
// else {
// piwikstatsdb.createTmpTables();
// logger.info("TmpTables are created ");
// }
logger.info("Initializing the download logs module"); logger.info("Initializing the download logs module");
PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
@ -106,9 +100,8 @@ public class UsageStatsExporter {
lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
logger.info("Downloaded LaReferencia logs"); logger.info("Downloaded LaReferencia logs");
} }
LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
if (ExecuteWorkflow.processLaReferenciaLogs) { if (ExecuteWorkflow.processLaReferenciaLogs) {
logger.info("Processing LaReferencia logs"); logger.info("Processing LaReferencia logs");
@ -116,7 +109,6 @@ public class UsageStatsExporter {
logger.info("LaReferencia logs done"); logger.info("LaReferencia logs done");
} }
IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
logger.info("Creating Irus Stats tables"); logger.info("Creating Irus Stats tables");
@ -132,14 +124,11 @@ public class UsageStatsExporter {
irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
} }
if (ExecuteWorkflow.irusProcessStats) {
if (ExecuteWorkflow.irusProcessStats) {
irusstats.processIrusStats(); irusstats.processIrusStats();
logger.info("Irus done"); logger.info("Irus done");
} }
SarcStats sarcStats = new SarcStats(); SarcStats sarcStats = new SarcStats();
if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
sarcStats.reCreateLogDirs(); sarcStats.reCreateLogDirs();
@ -148,51 +137,70 @@ public class UsageStatsExporter {
sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
} }
if (ExecuteWorkflow.sarcProcessStats) {
if (ExecuteWorkflow.sarcProcessStats) {
sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
sarcStats.finalizeSarcStats(); sarcStats.updateSarcLogs();
} }
logger.info("Sarc done"); logger.info("Sarc done");
/*
// finalize usagestats // finalize usagestats
logger.info("Dropping tmp tables");
if (ExecuteWorkflow.finalizeStats) { if (ExecuteWorkflow.finalizeStats) {
piwikstatsdb.finalizeStats(); piwikstatsdb.finalizeStats();
logger.info("Finalized stats"); logger.info("Dropped tmp tables");
} }
*/
/* logger.info("Raw Data Download End");
// Make the tables available to Impala
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
logger.info("Making tables visible to Impala");
invalidateMetadata();
}
*/
logger.info("End");
} }
private void invalidateMetadata() throws SQLException { public void createdDBWithTablesOnly() throws Exception {
Statement stmt = null; logger.info("Initialising DB properties");
ConnectDB.init();
stmt = ConnectDB.getImpalaConnection().createStatement(); PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
piwikstatsdb.recreateDBAndTables();
String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; piwikstatsdb.createPedocsOldUsageData();
Statement stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating LaReferencia tables");
String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
+ "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ "stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableLareferenciaLog);
logger.info("Created LaReferencia tables");
logger.info("Creating sushilog");
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog(source STRING, "
+ "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
+ "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableSushiLog);
logger.info("Created sushilog");
logger.info("Updating piwiklog");
String sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ ".piwiklog select * from openaire_prod_usage_raw.piwiklog";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; logger.info("Updating lareferencialog");
sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ ".lareferencialog select * from openaire_prod_usage_raw.lareferencialog";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; logger.info("Updating sushilog");
sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ ".sushilog select * from openaire_prod_usage_raw.sushilog";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; stmt.close();
stmt.executeUpdate(sql);
stmt.close();
ConnectDB.getHiveConnection().close(); ConnectDB.getHiveConnection().close();
logger.info("Sushi Tables Created");
} }
} }
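createdDBWithTablesOnly() above opens a Hive Statement by hand and only closes it on the success path. A minimal sketch of the same kind of DDL issued with try-with-resources instead; the JDBC URL and the usagestats schema prefix below are placeholders (the real values come from the dbHiveUrl and usageStatsDBSchema workflow parameters), and the DDL is the sushilog definition shown above:

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.Statement;

    public class HiveDdlSketch {

        public static void main(String[] args) throws Exception {
            Class.forName("org.apache.hive.jdbc.HiveDriver");

            // Placeholder URL; the real one is supplied via the dbHiveUrl parameter.
            String dbHiveUrl = "jdbc:hive2://hive.example.org:10000/default";

            // try-with-resources closes the Statement and Connection even when the DDL fails.
            try (Connection conn = DriverManager.getConnection(dbHiveUrl);
                Statement stmt = conn.createStatement()) {
                stmt.executeUpdate(
                    "CREATE TABLE IF NOT EXISTS usagestats.sushilog(source STRING, repository STRING, "
                        + "rid STRING, date STRING, metric_type STRING, count INT) "
                        + "clustered by (source, repository, rid, date, metric_type) "
                        + "into 100 buckets stored as orc tblproperties('transactional'='true')");
            }
        }
    }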

View File

@ -125,12 +125,6 @@
"paramDescription": "Starting log period", "paramDescription": "Starting log period",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "elp",
"paramLongName": "endingLogPeriod",
"paramDescription": "Ending log period",
"paramRequired": true
},
{ {
"paramName": "npidd", "paramName": "npidd",
"paramLongName": "numberOfPiwikIdsToDownload", "paramLongName": "numberOfPiwikIdsToDownload",
@ -216,12 +210,6 @@
"paramDescription": "Create the usage_stats table?", "paramDescription": "Create the usage_stats table?",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "ftvi",
"paramLongName": "finalTablesVisibleToImpala",
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
"paramRequired": true
},
{ {
"paramName": "nodt", "paramName": "nodt",
"paramLongName": "numberOfDownloadThreads", "paramLongName": "numberOfDownloadThreads",

View File

@ -63,7 +63,6 @@
<arg>--downloadPiwikLogs</arg><arg>${downloadPiwikLogs}</arg> <arg>--downloadPiwikLogs</arg><arg>${downloadPiwikLogs}</arg>
<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg> <arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
<arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg> <arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
<arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
<arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg> <arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
<arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg> <arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
<arg>--laReferenciaEmptyDirs</arg><arg>${laReferenciaEmptyDirs}</arg> <arg>--laReferenciaEmptyDirs</arg><arg>${laReferenciaEmptyDirs}</arg>
@ -78,7 +77,6 @@
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg> <arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
<arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg> <arg>--sarcNumberOfIssnToDownload</arg><arg>${sarcNumberOfIssnToDownload}</arg>
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg> <arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg> <arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>
<capture-output/> <capture-output/>
</java> </java>

View File

@ -23,7 +23,35 @@
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<artifactId>dhp-usage-stats-build</artifactId> <artifactId>dhp-usage-stats-build</artifactId>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.15</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<!-- more config here as you see fit -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>

View File

@ -0,0 +1 @@
mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagestatsbuild

View File

@ -3,12 +3,17 @@
* To change this template file, choose Tools | Templates * To change this template file, choose Tools | Templates
* and open the template in the editor. * and open the template in the editor.
*/ */
package eu.dnetlib.oa.graph.usagestatsbuild.export; package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.sql.Connection; import java.sql.Connection;
import java.sql.DriverManager; import java.sql.DriverManager;
import java.sql.SQLException; import java.sql.SQLException;
import java.sql.Statement; import java.sql.Statement;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties; import java.util.Properties;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -23,108 +28,120 @@ import com.mchange.v2.c3p0.ComboPooledDataSource;
public abstract class ConnectDB { public abstract class ConnectDB {
public static Connection DB_HIVE_CONNECTION; public static Connection DB_HIVE_CONNECTION;
public static Connection DB_IMPALA_CONNECTION; public static Connection DB_IMPALA_CONNECTION;
private static String dbHiveUrl; private static String dbHiveUrl;
private static String dbImpalaUrl; private static String dbImpalaUrl;
private static String usageRawDataDBSchema; private static String usageRawDataDBSchema;
private static String usageStatsDBSchema; private static String usageStatsDBSchema;
private static String statsDBSchema; private static String usagestatsPermanentDBSchema;
private final static Logger log = Logger.getLogger(ConnectDB.class); private static String statsDBSchema;
private final static Logger log = Logger.getLogger(ConnectDB.class);
static void init() throws ClassNotFoundException { static void init() throws ClassNotFoundException {
dbHiveUrl = ExecuteWorkflow.dbHiveUrl; dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema;
statsDBSchema = ExecuteWorkflow.statsDBSchema; statsDBSchema = ExecuteWorkflow.statsDBSchema;
usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema; usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema;
usagestatsPermanentDBSchema = ExecuteWorkflow.usagestatsPermanentDBSchema;
Class.forName("org.apache.hive.jdbc.HiveDriver"); Class.forName("org.apache.hive.jdbc.HiveDriver");
} }
public static Connection getHiveConnection() throws SQLException { public static Connection getHiveConnection() throws SQLException {
if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) {
return DB_HIVE_CONNECTION; return DB_HIVE_CONNECTION;
} else { } else {
DB_HIVE_CONNECTION = connectHive(); DB_HIVE_CONNECTION = connectHive();
return DB_HIVE_CONNECTION; return DB_HIVE_CONNECTION;
} }
} }
public static Connection getImpalaConnection() throws SQLException { public static Connection getImpalaConnection() throws SQLException {
if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) {
return DB_IMPALA_CONNECTION; return DB_IMPALA_CONNECTION;
} else { } else {
DB_IMPALA_CONNECTION = connectImpala(); DB_IMPALA_CONNECTION = connectImpala();
return DB_IMPALA_CONNECTION; return DB_IMPALA_CONNECTION;
} }
} }
public static String getUsageRawDataDBSchema() { public static String getUsageRawDataDBSchema() {
return usageRawDataDBSchema; return ConnectDB.usageRawDataDBSchema;
} }
public static String getUsageStatsDBSchema() { public static String getUsageStatsDBSchema() {
return ConnectDB.usageStatsDBSchema; String datePattern = "YYYYMMdd";
} DateFormat df = new SimpleDateFormat(datePattern);
// Get the today date using Calendar object.
Date today = Calendar.getInstance().getTime();
String todayAsString = df.format(today);
public static String getStatsDBSchema() { return ConnectDB.usageStatsDBSchema + "_" + todayAsString;
return ConnectDB.statsDBSchema; }
}
private static Connection connectHive() throws SQLException { public static String getStatsDBSchema() {
/* return ConnectDB.statsDBSchema;
}
public static String getUsagestatsPermanentDBSchema() {
return ConnectDB.usagestatsPermanentDBSchema;
}
private static Connection connectHive() throws SQLException {
/*
* Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt =
* connection.createStatement(); log.debug("Opened database successfully"); return connection; * connection.createStatement(); log.debug("Opened database successfully"); return connection;
*/ */
ComboPooledDataSource cpds = new ComboPooledDataSource(); ComboPooledDataSource cpds = new ComboPooledDataSource();
cpds.setJdbcUrl(dbHiveUrl); cpds.setJdbcUrl(dbHiveUrl);
cpds.setAcquireIncrement(1); cpds.setAcquireIncrement(1);
cpds.setMaxPoolSize(100); cpds.setMaxPoolSize(100);
cpds.setMinPoolSize(1); cpds.setMinPoolSize(1);
cpds.setInitialPoolSize(1); cpds.setInitialPoolSize(1);
cpds.setMaxIdleTime(300); cpds.setMaxIdleTime(300);
cpds.setMaxConnectionAge(36000); cpds.setMaxConnectionAge(36000);
cpds.setAcquireRetryAttempts(30); cpds.setAcquireRetryAttempts(30);
cpds.setAcquireRetryDelay(2000); cpds.setAcquireRetryDelay(2000);
cpds.setBreakAfterAcquireFailure(false); cpds.setBreakAfterAcquireFailure(false);
cpds.setCheckoutTimeout(0); cpds.setCheckoutTimeout(0);
cpds.setPreferredTestQuery("SELECT 1"); cpds.setPreferredTestQuery("SELECT 1");
cpds.setIdleConnectionTestPeriod(60); cpds.setIdleConnectionTestPeriod(60);
return cpds.getConnection(); return cpds.getConnection();
} }
private static Connection connectImpala() throws SQLException { private static Connection connectImpala() throws SQLException {
/* /*
* Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt =
* connection.createStatement(); log.debug("Opened database successfully"); return connection; * connection.createStatement(); log.debug("Opened database successfully"); return connection;
*/ */
ComboPooledDataSource cpds = new ComboPooledDataSource(); ComboPooledDataSource cpds = new ComboPooledDataSource();
cpds.setJdbcUrl(dbImpalaUrl); cpds.setJdbcUrl(dbImpalaUrl);
cpds.setAcquireIncrement(1); cpds.setAcquireIncrement(1);
cpds.setMaxPoolSize(100); cpds.setMaxPoolSize(100);
cpds.setMinPoolSize(1); cpds.setMinPoolSize(1);
cpds.setInitialPoolSize(1); cpds.setInitialPoolSize(1);
cpds.setMaxIdleTime(300); cpds.setMaxIdleTime(300);
cpds.setMaxConnectionAge(36000); cpds.setMaxConnectionAge(36000);
cpds.setAcquireRetryAttempts(30); cpds.setAcquireRetryAttempts(30);
cpds.setAcquireRetryDelay(2000); cpds.setAcquireRetryDelay(2000);
cpds.setBreakAfterAcquireFailure(false); cpds.setBreakAfterAcquireFailure(false);
cpds.setCheckoutTimeout(0); cpds.setCheckoutTimeout(0);
cpds.setPreferredTestQuery("SELECT 1"); cpds.setPreferredTestQuery("SELECT 1");
cpds.setIdleConnectionTestPeriod(60); cpds.setIdleConnectionTestPeriod(60);
return cpds.getConnection(); return cpds.getConnection();
} }
} }
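The reworked getUsageStatsDBSchema() above suffixes the configured schema name with the current date, so each run of the workflow builds into its own dated database. One detail worth knowing when reusing this pattern: in SimpleDateFormat, 'YYYY' is the week-based year while 'yyyy' is the calendar year, and the two can disagree in the last days of December. A small sketch of the difference; the usagestats_ prefix is made up, the real prefix comes from the usageStatsDBSchema parameter:

    import java.text.SimpleDateFormat;
    import java.util.Calendar;
    import java.util.GregorianCalendar;
    import java.util.Locale;

    public class SchemaSuffixSketch {

        public static void main(String[] args) {
            // 2019-12-30 falls in week 1 of 2020 (US week rules), so week year and calendar year differ.
            Calendar cal = new GregorianCalendar(2019, Calendar.DECEMBER, 30);

            String weekYear = new SimpleDateFormat("YYYYMMdd", Locale.US).format(cal.getTime());
            String calendarYear = new SimpleDateFormat("yyyyMMdd", Locale.US).format(cal.getTime());

            System.out.println("usagestats_" + weekYear);     // usagestats_20201230
            System.out.println("usagestats_" + calendarYear); // usagestats_20191230
        }
    }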

View File

@ -3,6 +3,7 @@
* To change this template file, choose Tools | Templates * To change this template file, choose Tools | Templates
* and open the template in the editor. * and open the template in the editor.
*/ */
package eu.dnetlib.oa.graph.usagestatsbuild.export; package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
@ -11,162 +12,142 @@ import java.util.Date;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.log4j.BasicConfigurator; import org.apache.log4j.BasicConfigurator;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/** /**
* @author D. Pierrakos, S. Zoupanos * @author D. Pierrakos, S. Zoupanos
*/ */
public class ExecuteWorkflow { public class ExecuteWorkflow {
static String matomoAuthToken; // static String matomoAuthToken;
static String matomoBaseURL; static String matomoBaseURL;
static String repoLogPath; static String repoLogPath;
static String portalLogPath; static String portalLogPath;
static String portalMatomoID; static String portalMatomoID;
static String irusUKBaseURL; // static String irusUKBaseURL;
static String irusUKReportPath; static String irusUKReportPath;
static String sarcsReportPathArray; static String sarcsReportPathArray;
static String sarcsReportPathNonArray; static String sarcsReportPathNonArray;
static String lareferenciaLogPath; static String lareferenciaLogPath;
static String lareferenciaBaseURL; // static String lareferenciaBaseURL;
static String lareferenciaAuthToken; // static String lareferenciaAuthToken;
static String dbHiveUrl; static String dbHiveUrl;
static String dbImpalaUrl; static String dbImpalaUrl;
static String usageRawDataDBSchema; static String usageRawDataDBSchema;
static String usageStatsDBSchema; static String usageStatsDBSchema;
static String statsDBSchema; static String usagestatsPermanentDBSchema;
static boolean recreateDbAndTables; static String statsDBSchema;
static boolean recreateDbAndTables;
static boolean piwikEmptyDirs; static boolean processPiwikLogs;
static boolean downloadPiwikLogs; static boolean processLaReferenciaLogs;
static boolean processPiwikLogs;
static Calendar startingLogPeriod; static boolean irusProcessStats;
static Calendar endingLogPeriod;
static int numberOfPiwikIdsToDownload;
static int numberOfSiteIdsToDownload;
static boolean laReferenciaEmptyDirs; static boolean sarcProcessStats;
static boolean downloadLaReferenciaLogs;
static boolean processLaReferenciaLogs;
static boolean irusCreateTablesEmptyDirs; static boolean finalizeStats;
static boolean irusDownloadReports; static boolean finalTablesVisibleToImpala;
static boolean irusProcessStats;
static int irusNumberOfOpendoarsToDownload;
static boolean sarcCreateTablesEmptyDirs; static int numberOfDownloadThreads;
static boolean sarcDownloadReports;
static boolean sarcProcessStats;
static int sarcNumberOfIssnToDownload;
static boolean finalizeStats; private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
static boolean finalTablesVisibleToImpala;
static int numberOfDownloadThreads; public static void main(String args[]) throws Exception {
private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); // Sending the logs to the console
BasicConfigurator.configure();
public static void main(String args[]) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
UsageStatsExporter.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json")));
parser.parseArgument(args);
// Sending the logs to the console // Setting up the initial parameters
BasicConfigurator.configure(); // matomoAuthToken = parser.get("matomoAuthToken");
// matomoBaseURL = parser.get("matomoBaseURL");
repoLogPath = parser.get("repoLogPath");
portalLogPath = parser.get("portalLogPath");
portalMatomoID = parser.get("portalMatomoID");
// irusUKBaseURL = parser.get("irusUKBaseURL");
irusUKReportPath = parser.get("irusUKReportPath");
sarcsReportPathArray = parser.get("sarcsReportPathArray");
sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
lareferenciaLogPath = parser.get("lareferenciaLogPath");
// lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
// lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
final ArgumentApplicationParser parser = new ArgumentApplicationParser( dbHiveUrl = parser.get("dbHiveUrl");
IOUtils dbImpalaUrl = parser.get("dbImpalaUrl");
.toString( usageRawDataDBSchema = parser.get("usageRawDataDBSchema");
UsageStatsExporter.class usageStatsDBSchema = parser.get("usageStatsDBSchema");
.getResourceAsStream( usagestatsPermanentDBSchema = parser.get("usagestatsPermanentDBSchema");
"/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json"))); statsDBSchema = parser.get("statsDBSchema");
parser.parseArgument(args);
// Setting up the initial parameters if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
matomoAuthToken = parser.get("matomoAuthToken"); processPiwikLogs = true;
matomoBaseURL = parser.get("matomoBaseURL"); } else {
repoLogPath = parser.get("repoLogPath"); processPiwikLogs = false;
portalLogPath = parser.get("portalLogPath"); }
portalMatomoID = parser.get("portalMatomoID");
irusUKBaseURL = parser.get("irusUKBaseURL");
irusUKReportPath = parser.get("irusUKReportPath");
sarcsReportPathArray = parser.get("sarcsReportPathArray");
sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
lareferenciaLogPath = parser.get("lareferenciaLogPath");
lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
dbHiveUrl = parser.get("dbHiveUrl"); // String startingLogPeriodStr = parser.get("startingLogPeriod");
dbImpalaUrl = parser.get("dbImpalaUrl"); // Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
usageRawDataDBSchema = parser.get("usageRawDataDBSchema"); // startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
usageStatsDBSchema = parser.get("usageStatsDBSchema"); //
statsDBSchema = parser.get("statsDBSchema"); // String endingLogPeriodStr = parser.get("endingLogPeriod");
// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
processPiwikLogs = true; recreateDbAndTables = true;
} else { } else {
processPiwikLogs = false; recreateDbAndTables = false;
} }
String startingLogPeriodStr = parser.get("startingLogPeriod"); if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) {
Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); processLaReferenciaLogs = true;
startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); } else {
processLaReferenciaLogs = false;
}
String endingLogPeriodStr = parser.get("endingLogPeriod"); if (parser.get("irusProcessStats").toLowerCase().equals("true")) {
Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); irusProcessStats = true;
endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); } else {
irusProcessStats = false;
}
numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); sarcProcessStats = true;
} else {
sarcProcessStats = false;
}
if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { if (parser.get("finalizeStats").toLowerCase().equals("true")) {
recreateDbAndTables = true; finalizeStats = true;
} else { } else {
recreateDbAndTables = false; finalizeStats = false;
} }
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) {
finalTablesVisibleToImpala = true;
} else {
numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
}
if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { UsageStatsExporter usagestatsExport = new UsageStatsExporter();
processLaReferenciaLogs = true; usagestatsExport.export();
} else { }
processLaReferenciaLogs = false;
}
if (parser.get("irusProcessStats").toLowerCase().equals("true")) { private static Calendar startingLogPeriodStr(Date date) {
irusProcessStats = true;
} else {
irusProcessStats = false;
}
irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); Calendar calendar = Calendar.getInstance();
calendar.setTime(date);
return calendar;
if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { }
sarcProcessStats = true;
} else {
sarcProcessStats = false;
}
sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload"));
if (parser.get("finalizeStats").toLowerCase().equals("true")) {
finalizeStats = true;
} else {
finalizeStats = false;
}
if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) {
finalTablesVisibleToImpala = true;
} else {
numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
}
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
usagestatsExport.export();
}
private static Calendar startingLogPeriodStr(Date date) {
Calendar calendar = Calendar.getInstance();
calendar.setTime(date);
return calendar;
}
} }
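Each boolean workflow parameter above is parsed with an explicit if/else on parser.get(...).toLowerCase().equals("true"). An equivalent one-liner, shown as a standalone sketch with the parameter value hard-coded for the example:

    public class FlagParsingSketch {

        public static void main(String[] args) {
            // Stand-in for parser.get("processPiwikLogs").
            String rawValue = "True";

            // Boolean.parseBoolean is case-insensitive and, unlike the pattern above,
            // returns false for a missing (null) value instead of throwing a NullPointerException.
            boolean processPiwikLogs = Boolean.parseBoolean(rawValue);

            System.out.println(processPiwikLogs); // true
        }
    }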

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export; package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.*; import java.io.*;
@ -27,45 +28,42 @@ import org.slf4j.LoggerFactory;
*/ */
public class IrusStats { public class IrusStats {
private String irusUKURL; private String irusUKURL;
private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
public IrusStats() throws Exception { public IrusStats() throws Exception {
} }
public void processIrusStats() throws Exception {
public void processIrusStats() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement();
Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false);
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Creating irus_downloads_stats_tmp table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".irus_downloads_stats_tmp "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`result_id` string, "
+ "`date` string, "
+ "`count` bigint, "
+ "`openaire` bigint)";
stmt.executeUpdate(createDownloadsStats);
logger.info("Created irus_downloads_stats_tmp table");
logger.info("Creating irus_downloads_stats_tmp table"); logger.info("Inserting into irus_downloads_stats_tmp");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp "
+ ".irus_downloads_stats_tmp " + "SELECT s.source, d.id AS repository_id, "
+ "(`source` string, " + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
+ "`repository_id` string, " + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
+ "`result_id` string, " + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+ "`date` string, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "`count` bigint, " + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
+ "`openaire` bigint)"; stmt.executeUpdate(insertDStats);
stmt.executeUpdate(createDownloadsStats); logger.info("Inserted into irus_downloads_stats_tmp");
logger.info("Created irus_downloads_stats_tmp table");
logger.info("Inserting into irus_downloads_stats_tmp"); stmt.close();
String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp " // ConnectDB.getHiveConnection().close();
+ "SELECT s.source, d.id AS repository_id, " }
+ "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+ ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
stmt.executeUpdate(insertDStats);
logger.info("Inserted into irus_downloads_stats_tmp");
stmt.close();
//ConnectDB.getHiveConnection().close();
}
} }
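processIrusStats() above keys each download count by month via CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')), i.e. the date column is reduced to a "YYYY/MM" bucket before it lands in irus_downloads_stats_tmp. The same bucketing in plain Java, as an illustration only (the example date is made up):

    import java.time.LocalDate;
    import java.time.format.DateTimeFormatter;

    public class MonthBucketSketch {

        public static void main(String[] args) {
            LocalDate eventDate = LocalDate.of(2020, 12, 12);

            // Matches CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) in the HiveQL above.
            String monthBucket = eventDate.format(DateTimeFormatter.ofPattern("yyyy/MM"));

            System.out.println(monthBucket); // 2020/12
        }
    }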

View File

@ -41,8 +41,6 @@ public class LaReferenciaStats {
public LaReferenciaStats() throws Exception { public LaReferenciaStats() throws Exception {
} }
public void processLogs() throws Exception { public void processLogs() throws Exception {
try { try {
logger.info("LaReferencia creating viewsStats"); logger.info("LaReferencia creating viewsStats");
@ -62,7 +60,6 @@ public class LaReferenciaStats {
} }
} }
public void viewsStats() throws Exception { public void viewsStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement(); Statement stmt = ConnectDB.getHiveConnection().createStatement();
@ -101,7 +98,7 @@ public class LaReferenciaStats {
logger.info("Created la_views_stats_tmp table"); logger.info("Created la_views_stats_tmp table");
stmt.close(); stmt.close();
ConnectDB.getHiveConnection().close(); // ConnectDB.getHiveConnection().close();
} }
private void downloadsStats() throws Exception { private void downloadsStats() throws Exception {
@ -142,8 +139,7 @@ public class LaReferenciaStats {
logger.info("Created la_downloads_stats_tmp table"); logger.info("Created la_downloads_stats_tmp table");
stmt.close(); stmt.close();
//ConnectDB.getHiveConnection().close(); // ConnectDB.getHiveConnection().close();
} }
} }

View File

@ -1,22 +1,15 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export; package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.*;
import java.net.URLDecoder;
import java.sql.Connection; import java.sql.Connection;
import java.sql.SQLException; import java.sql.SQLException;
import java.sql.Statement; import java.sql.Statement;
import java.sql.Timestamp;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.sql.Timestamp;
/** /**
* @author D. Pierrakos, S. Zoupanos * @author D. Pierrakos, S. Zoupanos
@ -29,37 +22,51 @@ public class PiwikStatsDB {
private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
public PiwikStatsDB() throws Exception { public PiwikStatsDB() throws Exception {
} }
public void recreateDBAndTables() throws Exception { public void recreateDBAndTables() throws Exception {
this.createDatabase(); this.createDatabase();
// The piwiklog table is not needed since it is built // The piwiklog table is not needed since it is built
// on top of JSON files // on top of JSON files
////////////this.createTmpTables(); //////////// this.createTmpTables();
} }
private void createDatabase() throws Exception { private void createDatabase() throws Exception {
// try {
//
// stmt = ConnectDB.getHiveConnection().createStatement();
//
// logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
// String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
// stmt.executeUpdate(dropDatabase);
// } catch (Exception e) {
// logger.error("Failed to drop database: " + e);
// throw new Exception("Failed to drop database: " + e.toString(), e);
// }
//
try { try {
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema();
stmt.executeUpdate(createDatabase);
logger.info("Usagestats DB created: " + ConnectDB.getUsageStatsDBSchema());
logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
stmt.executeUpdate(dropDatabase);
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to drop database: " + e); logger.error("Failed to create database: " + e);
throw new Exception("Failed to drop database: " + e.toString(), e); throw new Exception("Failed to create database: " + e.toString(), e);
} }
try { try {
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); logger.info("Creating permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema());
String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS "
stmt.executeUpdate(createDatabase); + ConnectDB.getUsagestatsPermanentDBSchema();
stmt.executeUpdate(createPermanentDatabase);
logger.info("Created permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema());
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to create database: " + e); logger.error("Failed to create database: " + e);
@ -67,17 +74,16 @@ public class PiwikStatsDB {
} }
} }
public void processLogs() throws Exception { public void processLogs() throws Exception {
try { try {
logger.info("ViewsStats processing starts at: "+new Timestamp(System.currentTimeMillis())); logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis()));
viewsStats(); viewsStats();
logger.info("ViewsStats processing ends at: "+new Timestamp(System.currentTimeMillis())); logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis()));
logger.info("DownloadsStats processing starts at: "+new Timestamp(System.currentTimeMillis())); logger.info("DownloadsStats processing starts at: " + new Timestamp(System.currentTimeMillis()));
downloadsStats(); downloadsStats();
logger.info("DownloadsStats processing ends at: "+new Timestamp(System.currentTimeMillis())); logger.info("DownloadsStats processing ends at: " + new Timestamp(System.currentTimeMillis()));
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to process logs: " + e); logger.error("Failed to process logs: " + e);
@ -85,68 +91,68 @@ public class PiwikStatsDB {
} }
} }
public void viewsStats() throws Exception { public void viewsStats() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement(); Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false); ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping openaire_result_views_monthly_tmp view"); logger.info("Dropping openaire_result_views_monthly_tmp view");
String drop_result_views_monthly = "DROP VIEW IF EXISTS " + String drop_result_views_monthly = "DROP VIEW IF EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".openaire_piwikresult_views_monthly_tmp"; + ".openaire_piwikresult_views_monthly_tmp";
stmt.executeUpdate(drop_result_views_monthly); stmt.executeUpdate(drop_result_views_monthly);
logger.info("Dropped openaire_result_views_monthly_tmp view"); logger.info("Dropped openaire_result_views_monthly_tmp view");
logger.info("Creating openaire_result_views_monthly_tmp view"); logger.info("Creating openaire_result_views_monthly_tmp view");
String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_result_views_monthly_tmp " + + ".openaire_result_views_monthly_tmp "
"AS SELECT entity_id AS id, " + + "AS SELECT entity_id, "
"COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
"AS openaire_referrer, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) "
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "AS openaire_referrer, "
"FROM " + ConnectDB.getUsageRawDataDBSchema() + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ ".piwiklog where action='action' and (source_item_type='oaItem' or " + + "FROM " + ConnectDB.getUsageRawDataDBSchema()
"source_item_type='repItem') " + + ".piwiklog where action='action' and (source_item_type='oaItem' or "
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + + "source_item_type='repItem') "
"source ORDER BY source, entity_id"; + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
+ "source ORDER BY source, entity_id";
stmt.executeUpdate(create_result_views_monthly); stmt.executeUpdate(create_result_views_monthly);
logger.info("Created openaire_result_views_monthly_tmp table"); logger.info("Created openaire_result_views_monthly_tmp table");
logger.info("Dropping openaire_views_stats_tmp table"); logger.info("Dropping openaire_views_stats_tmp table");
String drop_views_stats = "DROP TABLE IF EXISTS " + String drop_views_stats = "DROP TABLE IF EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".openaire_views_stats_tmp"; + ".openaire_views_stats_tmp";
stmt.executeUpdate(drop_views_stats); stmt.executeUpdate(drop_views_stats);
logger.info("Dropped openaire_views_stats_tmp table"); logger.info("Dropped openaire_views_stats_tmp table");
logger.info("Creating openaire_views_stats_tmp table"); logger.info("Creating openaire_views_stats_tmp table");
String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_views_stats_tmp " + + ".openaire_views_stats_tmp "
"AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
"max(views) AS count, max(openaire_referrer) AS openaire " + + "max(views) AS count, max(openaire_referrer) AS openaire "
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
"WHERE p.source=d.piwik_id AND p.id=ro.oid " + + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' "
"GROUP BY d.id, ro.id, month " + + "GROUP BY d.id, ro.id, month "
"ORDER BY d.id, ro.id, month "; + "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(create_views_stats); stmt.executeUpdate(create_views_stats);
logger.info("Created openaire_views_stats_tmp table"); logger.info("Created openaire_views_stats_tmp table");
logger.info("Creating openaire_pageviews_stats_tmp table"); logger.info("Creating openaire_pageviews_stats_tmp table");
String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".openaire_pageviews_stats_tmp AS SELECT " + + ".openaire_pageviews_stats_tmp AS SELECT "
"'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + + "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count "
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, "
ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
"WHERE p.source=" + ExecuteWorkflow.portalMatomoID + " AND p.source=d.piwik_id and p.id=ro.id \n" + + "WHERE p.source=" + ExecuteWorkflow.portalMatomoID
"GROUP BY d.id, ro.id, month " + + " AND p.source=d.piwik_id and p.id=ro.id AND ro.oid!='200' "
"ORDER BY d.id, ro.id, month "; + "GROUP BY d.id, ro.id, month "
+ "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(create_pageviews_stats); stmt.executeUpdate(create_pageviews_stats);
logger.info("Created pageviews_stats table"); logger.info("Created pageviews_stats table");
stmt.close(); stmt.close();
//ConnectDB.getHiveConnection().close(); // ConnectDB.getHiveConnection().close();
} }
private void downloadsStats() throws Exception { private void downloadsStats() throws Exception {
@ -154,152 +160,315 @@ public class PiwikStatsDB {
ConnectDB.getHiveConnection().setAutoCommit(false); ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Dropping openaire_result_downloads_monthly_tmp view"); logger.info("Dropping openaire_result_downloads_monthly_tmp view");
String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " + String drop_result_downloads_monthly = "DROP VIEW IF EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".openaire_result_downloads_monthly_tmp"; + ".openaire_result_downloads_monthly_tmp";
stmt.executeUpdate(drop_result_downloads_monthly); stmt.executeUpdate(drop_result_downloads_monthly);
logger.info("Dropped openaire_result_downloads_monthly_tmp view"); logger.info("Dropped openaire_result_downloads_monthly_tmp view");
logger.info("Creating openaire_result_downloads_monthly_tmp view"); logger.info("Creating openaire_result_downloads_monthly_tmp view");
String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp " + String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema()
"AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " + + ".openaire_result_downloads_monthly_tmp "
"SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "AS SELECT entity_id, "
"CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id,"
"FROM " + ConnectDB.getUsageRawDataDBSchema()+ ".piwiklog where action='download' " + + "COUNT(entity_id) as downloads, "
"AND (source_item_type='oaItem' OR source_item_type='repItem') " + + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
"GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
"ORDER BY source, entity_id, month"; + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog where action='download' "
+ "AND (source_item_type='oaItem' OR source_item_type='repItem') "
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source "
+ "ORDER BY source, entity_id, month";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
logger.info("Created openaire_result_downloads_monthly_tmp view"); logger.info("Created openaire_result_downloads_monthly_tmp view");
logger.info("Dropping openaire_downloads_stats_tmp table"); logger.info("Dropping openaire_downloads_stats_tmp table");
String drop_views_stats = "DROP TABLE IF EXISTS " + String drop_views_stats = "DROP TABLE IF EXISTS "
ConnectDB.getUsageStatsDBSchema() + + ConnectDB.getUsageStatsDBSchema()
".openaire_downloads_stats_tmp"; + ".openaire_downloads_stats_tmp";
stmt.executeUpdate(drop_views_stats); stmt.executeUpdate(drop_views_stats);
logger.info("Dropped openaire_downloads_stats_tmp table"); logger.info("Dropped openaire_downloads_stats_tmp table");
logger.info("Creating openaire_downloads_stats_tmp table"); logger.info("Creating openaire_downloads_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS "
"SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
"max(downloads) AS count, max(openaire_referrer) AS openaire " + + "max(downloads) AS count, max(openaire_referrer) AS openaire "
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, "
ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
"WHERE p.source=d.piwik_id and p.id=ro.oid " + + "WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' "
"GROUP BY d.id, ro.id, month " + + "GROUP BY d.id, ro.id, month "
"ORDER BY d.id, ro.id, month "; + "ORDER BY d.id, ro.id, month ";
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
logger.info("Created downloads_stats table"); logger.info("Created downloads_stats table");
logger.info("Dropping openaire_result_downloads_monthly_tmp view"); logger.info("Dropping openaire_result_downloads_monthly_tmp view");
sql = "DROP VIEW IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp"; sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp";
logger.info("Dropped openaire_result_downloads_monthly_tmp view "); logger.info("Dropped openaire_result_downloads_monthly_tmp view ");
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
stmt.close(); stmt.close();
//ConnectDB.getHiveConnection().close(); // ConnectDB.getHiveConnection().close();
}
public void uploadOldPedocs() throws Exception {
stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
// Dropping Pedocs pedocs_views_stats_tmp table
logger.info("Dropping Pedocs pedocs_views_stats_tmp table");
String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp";
logger.info("Dropped pedocs_views_stats_tmp table ");
stmt.executeUpdate(sql);
// Dropping Pedocs pedocs_downloads_stats table
logger.info("Dropping pedocs_downloads_stats table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats";
logger.info("Dropped pedocs_downloads_stats table ");
stmt.executeUpdate(sql);
// Creating Pedocs pedocs_views_stats_tmp table
logger.info("Creating Pedocs pedocs_views_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp AS "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id,"
+ "r.id as result_id,date,counter_abstract as count, 0 as openaire "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsoldviews p, " + ConnectDB.getStatsDBSchema()
+ ".result_oids r where r.oid=p.identifier";
stmt.executeUpdate(sql);
logger.info("Created pedocs_views_stats_tmp table ");
// Creating Pedocs pedocs_downloads_stats_tmp table
logger.info("Creating Pedocs pedocs_downloads_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp AS "
+ "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id,"
+ "r.id as result_id, date, counter as count, 0 as openaire "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsolddownloads p, " + ConnectDB.getStatsDBSchema()
+ ".result_oids r where r.oid=p.identifier";
stmt.executeUpdate(sql);
logger.info("Created pedocs_downloads_stats_tmp table ");
}
public void uploadTUDELFTStats() throws Exception {
stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
// Dropping TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view");
String sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp";
logger.info("Dropped tudelft_result_views_monthly_tmp view ");
stmt.executeUpdate(sql);
// Dropping TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view");
sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp";
logger.info("Dropped tudelft_result_downloads_monthly_tmp view ");
stmt.executeUpdate(sql);
// Dropping TUDELFT tudelft_views_stats_tmp table
logger.info("Dropping TUDELFT tudelft_views_stats_tmp table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp";
logger.info("Dropped tudelft_views_stats_tmp table ");
stmt.executeUpdate(sql);
// Dropping TUDELFT tudelft_downloads_stats_tmp table
logger.info("Dropping TUDELFT tudelft_downloads_stats_tmp table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp";
logger.info("Dropped tudelft_downloads_stats_tmp table ");
stmt.executeUpdate(sql);
// Creating TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Creating TUDELFT tudelft_result_views_monthly_tmp view");
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp "
+ "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog "
+ "WHERE action='action' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 "
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id";
stmt.executeUpdate(sql);
logger.info("Created tudelft_result_views_monthly_tmp view ");
// Creating TUDELFT tudelft_views_stats_tmp table
logger.info("Creating TUDELFT tudelft_views_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp AS "
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".tudelft_result_views_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' "
+ "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id";
stmt.executeUpdate(sql);
logger.info("Created TUDELFT tudelft_views_stats_tmp table");
// Creating TUDELFT tudelft_result_downloads_monthly_tmp view
logger.info("Creating TUDELFT tudelft_result_downloads_monthly_tmp view");
sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp "
+ "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, "
+ "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, "
+ "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog "
+ "WHERE action='download' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 "
+ "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id";
stmt.executeUpdate(sql);
logger.info("Created tudelft_result_downloads_monthly_tmp view ");
// Creating TUDELFT tudelft_downloads_stats_tmp table
logger.info("Creating TUDELFT tudelft_downloads_stats_tmp table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp AS "
+ "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, "
+ "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".tudelft_result_downloads_monthly_tmp p, "
+ ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro "
+ "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' "
+ "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id";
stmt.executeUpdate(sql);
logger.info("Created TUDELFT tudelft_downloads_stats_tmp table");
// Dropping TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view");
sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp";
logger.info("Dropped tudelft_result_views_monthly_tmp view ");
stmt.executeUpdate(sql);
// Dropping TUDELFT tudelft_result_views_monthly_tmp view
logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view");
sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp";
logger.info("Dropped tudelft_result_downloads_monthly_tmp view ");
stmt.executeUpdate(sql);
} }
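The monthly views above (and their TUDELFT variants) wrap entity_id in reflect('java.net.URLDecoder', 'decode', entity_id), which makes Hive call the static Java method URLDecoder.decode on each percent-encoded identifier before it is matched against result_oids. A standalone sketch of that decoding step; the identifier below is made up, and the explicit-charset overload is used here for determinism, whereas reflect() with one string argument resolves to the single-argument decode:

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;

    public class EntityIdDecodeSketch {

        public static void main(String[] args) throws UnsupportedEncodingException {
            // Hypothetical percent-encoded identifier as it might appear in piwiklog.entity_id.
            String entityId = "oai%3Aexample.org%3A1234%2F5678";

            // Equivalent of reflect('java.net.URLDecoder', 'decode', entity_id) in the views above.
            String decoded = URLDecoder.decode(entityId, "UTF-8");

            System.out.println(decoded); // oai:example.org:1234/5678
        }
    }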
public void finalizeStats() throws Exception { public void finalizeStats() throws Exception {
stmt = ConnectDB.getHiveConnection().createStatement(); stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false); ConnectDB.getHiveConnection().setAutoCommit(false);
//Dropping views_stats table // Dropping views_stats table
logger.info("Dropping views_stats table"); logger.info("Dropping views_stats table");
String sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".views_stats"; String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
logger.info("Dropped views_stats table "); logger.info("Dropped views_stats table ");
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
//Dropping downloads_stats table // Dropping downloads_stats table
logger.info("Dropping downloads_stats table"); logger.info("Dropping downloads_stats table");
sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
logger.info("Dropped downloads_stats table "); logger.info("Dropped downloads_stats table ");
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
//Dropping page_views_stats table // Dropping page_views_stats table
logger.info("Dropping pageviews_stats table"); logger.info("Dropping pageviews_stats table");
sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
logger.info("Dropped pageviews_stats table "); logger.info("Dropped pageviews_stats table ");
stmt.executeUpdate(sql); stmt.executeUpdate(sql);
//Creating views_stats table // Dropping usage_stats table
logger.info("Dropping usage_stats table");
sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
logger.info("Dropped usage_stats table ");
stmt.executeUpdate(sql);
// Creating views_stats table
logger.info("Creating views_stats table");
String createViewsStats = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".views_stats "
+ "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(createViewsStats);
logger.info("Created views_stats table");
// Inserting OpenAIRE views stats
logger.info("Inserting Openaire data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Openaire views updated to views_stats");
// Inserting Pedocs old views stats
logger.info("Inserting Pedocs old data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Pedocs views updated to views_stats");
// Inserting TUDELFT views stats
logger.info("Inserting TUDELFT data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("TUDELFT views updated to views_stats");
// Inserting Lareferencia views stats
logger.info("Inserting LaReferencia data to views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp";
stmt.executeUpdate(sql);
logger.info("LaReferencia views updated to views_stats");
logger.info("Creating downloads_stats table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".downloads_stats "
+ "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(createDownloadsStats);
logger.info("Created downloads_stats table");
// Inserting OpenAIRE downloads stats
logger.info("Inserting OpenAIRE data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserted OpenAIRE data to downloads_stats");
// Inserting Pedocs old downloads stats
logger.info("Inserting PeDocs old data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserted Pedocs data to downloads_stats");
// Inserting TUDELFT downloads stats
logger.info("Inserting TUDELFT data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Inserted TUDELFT data to downloads_stats");
// Inserting Lareferencia downloads stats
logger.info("Inserting LaReferencia data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Lareferencia downloads updated to downloads_stats");
// Inserting IRUS downloads stats
logger.info("Inserting IRUS data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("IRUS downloads updated to downloads_stats");
// Inserting SARC-OJS downloads stats
logger.info("Inserting SARC data to downloads_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp";
stmt.executeUpdate(sql);
logger.info("SARC-OJS downloads updated to downloads_stats");
logger.info("Creating pageviews_stats table");
String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".pageviews_stats "
+ "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET";
stmt.executeUpdate(create_pageviews_stats);
logger.info("Created pageviews_stats table");
// Inserting OpenAIRE views stats from Portal
logger.info("Inserting data to page_views_stats");
sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats "
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp";
stmt.executeUpdate(sql);
logger.info("Dropping full_dates table");
String dropFullDates = "DROP TABLE IF EXISTS "
+ ConnectDB.getUsageStatsDBSchema()
+ ".full_dates";
stmt.executeUpdate(dropFullDates);
logger.info("Dropped full_dates table");
@ -310,35 +479,80 @@ public class PiwikStatsDB {
int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
logger.info("Creating full_dates table");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS "
+ "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date "
+ "FROM (SELECT DATE '2016-01-01' AS from_date) p "
+ "LATERAL VIEW "
+ "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x";
stmt.executeUpdate(sql);
logger.info("Created full_dates table");
logger.info("Inserting data to usage_stats");
sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS "
+ "SELECT coalesce(ds.source, vs.source) as source, "
+ "coalesce(ds.repository_id, vs.repository_id) as repository_id, "
+ "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, "
+ "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, "
+ "coalesce(ds.openaire, 0) as openaire_downloads, "
+ "coalesce(vs.openaire, 0) as openaire_views "
+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN "
+ ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source "
+ "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date";
stmt.executeUpdate(sql);
logger.info("Inserted data to usage_stats");
logger.info("Building views at permanent DB starts at: " + new Timestamp(System.currentTimeMillis()));
logger.info("Dropping view views_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view views_stats on permanent usagestats DB");
logger.info("Create view views_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
stmt.executeUpdate(sql);
logger.info("Created view views_stats on permanent usagestats DB");
logger.info("Dropping view pageviews_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view pageviews_stats on permanent usagestats DB");
logger.info("Create view pageviews_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
logger.info("Created view pageviews_stats on permanent usagestats DB");
logger.info("Dropping view downloads_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view on downloads_stats on permanent usagestats DB");
logger.info("Create view on downloads_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
logger.info("Created view on downloads_stats on permanent usagestats DB");
logger.info("Dropping view usage_stats on permanent usagestats DB");
sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
logger.info("Dropped view on usage_stats on permanent usagestats DB");
logger.info("Create view on usage_stats on permanent usagestats DB");
sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"
+ " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
logger.info("Created view on usage_stats on permanent usagestats DB");
logger.info("Building views at permanent DB ends at: " + new Timestamp(System.currentTimeMillis()));
stmt.close();
ConnectDB.getHiveConnection().close();
}
private Connection getConnection() throws SQLException {
return ConnectDB.getHiveConnection();
}

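A note on the usage_stats query in finalizeStats() above: the FULL OUTER JOIN plus coalesce(..., 0) means every (source, repository_id, result_id, date) key that appears in either downloads_stats or views_stats gets exactly one row, with 0 filled in for the metric that is missing. Below is a minimal plain-Java sketch of that merge, for illustration only; the class name and the sample keys are invented and are not part of this module.

import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

public class UsageStatsMergeExample {
    public static void main(String[] args) {
        // Toy stand-ins for downloads_stats and views_stats, keyed by repository|result|month.
        Map<String, Integer> downloads = Map.of("repo1|res1|2020/01", 7);
        Map<String, Integer> views = Map.of("repo1|res1|2020/01", 3, "repo1|res2|2020/02", 5);

        // Union of keys plays the role of the FULL OUTER JOIN on the join columns.
        Set<String> keys = new TreeSet<>(downloads.keySet());
        keys.addAll(views.keySet());

        for (String key : keys) {
            int d = downloads.getOrDefault(key, 0); // coalesce(ds.count, 0)
            int v = views.getOrDefault(key, 0); // coalesce(vs.count, 0)
            System.out.println(key + " downloads=" + d + " views=" + v);
        }
    }
}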
View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.*;
@ -33,74 +34,74 @@ import org.slf4j.LoggerFactory;
 */
public class SarcStats {
private Statement stmtHive = null;
private Statement stmtImpala = null;
private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
public SarcStats() throws Exception {
// createTables();
}
private void createTables() throws Exception {
try {
stmtHive = ConnectDB.getHiveConnection().createStatement();
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
stmtHive.executeUpdate(sqlCreateTableSushiLog);
// String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
// stmt.executeUpdate(sqlCopyPublicSushiLog);
String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ " ON INSERT TO sushilog "
+ " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
+ "sushilog.rid, sushilog.date "
+ "FROM sushilog "
+ "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
stmtHive.executeUpdate(sqlcreateRuleSushiLog);
String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
stmtHive.executeUpdate(createSushiIndex);
stmtHive.close();
ConnectDB.getHiveConnection().close();
logger.info("Sushi Tables Created");
} catch (Exception e) {
logger.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
}
public void processSarc() throws Exception {
Statement stmt = ConnectDB.getHiveConnection().createStatement();
ConnectDB.getHiveConnection().setAutoCommit(false);
logger.info("Creating sarc_downloads_stats_tmp table");
String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ ".sarc_downloads_stats_tmp "
+ "(`source` string, "
+ "`repository_id` string, "
+ "`result_id` string, "
+ "`date` string, "
+ "`count` bigint, "
+ "`openaire` bigint)";
stmt.executeUpdate(createDownloadsStats);
logger.info("Created sarc_downloads_stats_tmp table");
logger.info("Inserting into sarc_downloads_stats_tmp");
String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp "
+ "SELECT s.source, d.id AS repository_id, "
+ "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', "
+ "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' "
+ "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
+ ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
+ ConnectDB.getStatsDBSchema() + ".result_pids ro "
+ "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') "
+ "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'";
stmt.executeUpdate(insertSarcStats);
logger.info("Inserted into sarc_downloads_stats_tmp");
stmt.close();
// ConnectDB.getHiveConnection().close();
}
}

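The CONCAT(CAST(YEAR(`date`) AS STRING), '/', LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) expression above, like the equivalent one in the TUDELFT monthly views, buckets each record into a "yyyy/MM" month key, which is the date format used throughout the *_stats tables and full_dates. A standalone sketch of the same formatting in plain Java, illustrative only; the class and method names are invented.

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class MonthBucketExample {

    // Mirrors the Hive YEAR/LPAD(MONTH) concatenation: 2020-03-14 -> "2020/03".
    static String monthBucket(LocalDate date) {
        return date.format(DateTimeFormatter.ofPattern("yyyy/MM"));
    }

    public static void main(String[] args) {
        System.out.println(monthBucket(LocalDate.of(2020, 3, 14)));
    }
}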
View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagestatsbuild.export;
import java.io.IOException;
@ -17,90 +18,110 @@ import org.slf4j.LoggerFactory;
 */
public class UsageStatsExporter {
public UsageStatsExporter() {
}
private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
public void export() throws Exception {
logger.info("Initialising DB properties");
ConnectDB.init();
// runImpalaQuery();
PiwikStatsDB piwikstatsdb = new PiwikStatsDB();
logger.info("Re-creating database and tables");
if (ExecuteWorkflow.recreateDbAndTables) {
piwikstatsdb.recreateDBAndTables();
logger.info("DB-Tables are created ");
}
// else {
// piwikstatsdb.createTmpTables();
// logger.info("TmpTables are created ");
// }
if (ExecuteWorkflow.processPiwikLogs) {
logger.info("Processing Piwik logs");
piwikstatsdb.processLogs();
logger.info("Piwik logs Done");
logger.info("Processing Pedocs Old Stats");
piwikstatsdb.uploadOldPedocs();
logger.info("Processing Pedocs Old Stats Done");
logger.info("Processing TUDELFT Stats");
piwikstatsdb.uploadTUDELFTStats();
logger.info("Processing TUDELFT Stats Done");
}
LaReferenciaStats lastats = new LaReferenciaStats();
if (ExecuteWorkflow.processLaReferenciaLogs) {
logger.info("Processing LaReferencia logs");
lastats.processLogs();
logger.info("LaReferencia logs done");
}
IrusStats irusstats = new IrusStats();
if (ExecuteWorkflow.irusProcessStats) {
logger.info("Processing IRUS");
irusstats.processIrusStats();
logger.info("Irus done");
}
SarcStats sarcStats = new SarcStats();
if (ExecuteWorkflow.sarcProcessStats) {
sarcStats.processSarc();
}
logger.info("Sarc done");
// finalize usagestats
if (ExecuteWorkflow.finalizeStats) {
piwikstatsdb.finalizeStats();
logger.info("Finalized stats");
}
// Make the tables available to Impala
if (ExecuteWorkflow.finalTablesVisibleToImpala) {
logger.info("Making tables visible to Impala");
invalidateMetadata();
}
logger.info("End");
}
private void invalidateMetadata() throws SQLException {
Statement stmt = null;
stmt = ConnectDB.getImpalaConnection().createStatement();
String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats";
stmt.executeUpdate(sql);
sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats";
stmt.executeUpdate(sql);
stmt.close();
ConnectDB.getHiveConnection().close();
}
}

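invalidateMetadata() above refreshes the Impala catalog for the freshly rebuilt tables in both the usagestats schema and the permanent schema, since Impala does not see Hive-side DDL until the metadata is invalidated. The sketch below shows the same refresh written with try-with-resources; it is only an illustration, not the module's implementation, and the helper class is invented (it takes the connection and schema as arguments instead of reading them from ConnectDB).

import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;

public class ImpalaRefreshExample {

    // Issues INVALIDATE METADATA for each table of a schema and always closes the Statement.
    static void refresh(Connection impala, String schema, String... tables) throws SQLException {
        try (Statement stmt = impala.createStatement()) {
            for (String table : tables) {
                stmt.executeUpdate("INVALIDATE METADATA " + schema + "." + table);
            }
        }
    }
}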
View File

@ -1,237 +1,128 @@
[
{
"paramName": "rlp",
"paramLongName": "repoLogPath",
"paramDescription": "nameNode of the source cluster",
"paramRequired": true
},
{
"paramName": "plp",
"paramLongName": "portalLogPath",
"paramDescription": "namoNode of the target cluster",
"paramRequired": true
},
{
"paramName": "pmi",
"paramLongName": "portalMatomoID",
"paramDescription": "namoNode of the target cluster",
"paramRequired": true
},
{
"paramName": "iukrp",
"paramLongName": "irusUKReportPath",
"paramDescription": "maximum number of map tasks used in the distcp process",
"paramRequired": true
},
{
"paramName": "srpa",
"paramLongName": "sarcsReportPathArray",
"paramDescription": "memory for distcp action copying actionsets from remote cluster",
"paramRequired": true
},
{
"paramName": "srpna",
"paramLongName": "sarcsReportPathNonArray",
"paramDescription": "timeout for distcp copying actions from remote cluster",
"paramRequired": true
},
{
"paramName": "llp",
"paramLongName": "lareferenciaLogPath",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "dbhu",
"paramLongName": "dbHiveUrl",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "dbiu",
"paramLongName": "dbImpalaUrl",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "urdbs",
"paramLongName": "usageRawDataDBSchema",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "usdbs",
"paramLongName": "usageStatsDBSchema",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "sdbs",
"paramLongName": "statsDBSchema",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "uspdbs",
"paramLongName": "usagestatsPermanentDBSchema",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "rdbt",
"paramLongName": "recreateDbAndTables",
"paramDescription": "Re-create database and initial tables?",
"paramRequired": true
},
{
"paramName": "ppwl",
"paramLongName": "processPiwikLogs",
"paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
"paramRequired": true
},
{
"paramName": "plrl",
"paramLongName": "processLaReferenciaLogs",
"paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data",
"paramRequired": true
},
{
"paramName": "ipr",
"paramLongName": "irusProcessStats",
"paramDescription": "Irus section: Process stats?",
"paramRequired": true
},
{
"paramName": "ipr",
"paramLongName": "sarcProcessStats",
"paramDescription": "Sarc section: Process stats?",
"paramRequired": true
},
{
"paramName": "fs",
"paramLongName": "finalizeStats",
"paramDescription": "Create the usage_stats table?",
"paramRequired": true
},
{
"paramName": "ftvi",
"paramLongName": "finalTablesVisibleToImpala",
"paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala",
"paramRequired": true
},
{
"paramName": "nodt",
"paramLongName": "numberOfDownloadThreads",
"paramDescription": "Number of download threads",
"paramRequired": true
}
]

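Each entry above defines one command-line parameter of ExecuteWorkflow; the Oozie workflow below passes them as --paramLongName value pairs. The sketch below reads such a definition file and verifies that every required long name was supplied. It is an illustration only: it uses Jackson and takes the JSON path as its first argument, and it is not how the module itself parses its arguments.

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.io.File;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class ParamCheckExample {
    public static void main(String[] args) throws Exception {
        // args[0] = path to the parameter definition JSON, remaining args = --name value pairs.
        JsonNode defs = new ObjectMapper().readTree(new File(args[0]));

        Set<String> supplied = new HashSet<>();
        for (String a : Arrays.asList(args).subList(1, args.length)) {
            if (a.startsWith("--")) {
                supplied.add(a.substring(2));
            }
        }

        for (JsonNode def : defs) {
            String longName = def.get("paramLongName").asText();
            if (def.get("paramRequired").asBoolean() && !supplied.contains(longName)) {
                throw new IllegalArgumentException("Missing required parameter: --" + longName);
            }
        }
        System.out.println("All required parameters supplied");
    }
}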
View File

@ -42,42 +42,24 @@
<action name='Step1'>
<java>
<main-class>eu.dnetlib.oa.graph.usagestatsbuild.export.ExecuteWorkflow</main-class>
<arg>--repoLogPath</arg><arg>${repoLogPath}</arg>
<arg>--portalLogPath</arg><arg>${portalLogPath}</arg>
<arg>--portalMatomoID</arg><arg>${portalMatomoID}</arg>
<arg>--irusUKReportPath</arg><arg>${irusUKReportPath}</arg>
<arg>--sarcsReportPathArray</arg><arg>${sarcsReportPathArray}</arg>
<arg>--sarcsReportPathNonArray</arg><arg>${sarcsReportPathNonArray}</arg>
<arg>--lareferenciaLogPath</arg><arg>${lareferenciaLogPath}</arg>
<arg>--dbHiveUrl</arg><arg>${hiveJdbcUrl}</arg>
<arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
<arg>--usageRawDataDBSchema</arg><arg>${usageRawDataDBSchema}</arg>
<arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
<arg>--usagestatsPermanentDBSchema</arg><arg>${usagestatsPermanentDBSchema}</arg>
<arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
<arg>--recreateDbAndTables</arg><arg>${recreateDbAndTables}</arg>
<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
<arg>--processLaReferenciaLogs</arg><arg>${processLaReferenciaLogs}</arg>
<arg>--irusProcessStats</arg><arg>${irusProcessStats}</arg>
<arg>--sarcProcessStats</arg><arg>${sarcProcessStats}</arg>
<arg>--finalizeStats</arg><arg>${finalizeStats}</arg>
<arg>--finalTablesVisibleToImpala</arg><arg>${finalTablesVisibleToImpala}</arg>
<arg>--numberOfDownloadThreads</arg><arg>${numberOfDownloadThreads}</arg>