diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml new file mode 100644 index 000000000..a65c4514a --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml @@ -0,0 +1,18 @@ + + + + + + JDK_1.8 + + diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml index 14b543a57..5593d4d87 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml @@ -23,7 +23,35 @@ 4.0.0 dhp-usage-datasets-stats-update - + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + UTF-8 UTF-8 @@ -68,6 +96,11 @@ dhp-common ${project.version} + + com.mchange + c3p0 + 0.9.5.2 + c3p0 c3p0 diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh new file mode 100755 index 000000000..9b4325508 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java index e6da7eff3..25b30e8ad 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java @@ -32,8 +32,8 @@ public abstract class ConnectDB { private static String datasetUsageStatsDBSchema; private static String statsDBSchema; private final static Logger logger = Logger.getLogger(ConnectDB.class); - private Statement stmt = null; - + private Statement stmt = null; + static void init() throws ClassNotFoundException { dbHiveUrl = ExecuteWorkflow.dbHiveUrl; @@ -79,6 +79,7 @@ public abstract class ConnectDB { */ ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbHiveUrl); + cpds.setUser("dimitris.pierrakos"); cpds.setAcquireIncrement(1); cpds.setMaxPoolSize(100); cpds.setMinPoolSize(1); @@ -93,10 +94,10 @@ public abstract class ConnectDB { cpds.setCheckoutTimeout(0); cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - - logger.info("Opened database successfully"); - return cpds.getConnection(); + logger.info("Opened database successfully"); + + return cpds.getConnection(); } @@ -107,6 +108,7 @@ public abstract class ConnectDB { */ ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbImpalaUrl); + cpds.setUser("dimitris.pierrakos"); cpds.setAcquireIncrement(1); cpds.setMaxPoolSize(100); cpds.setMinPoolSize(1); @@ -122,81 +124,8 @@ public abstract class ConnectDB { cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened database successfully"); + logger.info("Opened database successfully"); return cpds.getConnection(); } - - private void createDatabase() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Dropping logs DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); - } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); - } - - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); - - } catch (Exception e) { - logger.error("Failed to create database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } - } - - private void createTables() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - // Create Piwiklog table - This table should exist - String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) " - + "into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTablePiwikLog); - - ///////////////////////////////////////// - // Rule for duplicate inserts @ piwiklog - ///////////////////////////////////////// - - String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTablePortalLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ process_portal_log - ////////////////////////////////////////////////// - - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } } -/* -CREATE TABLE IF NOT EXISTS dataciteReports (reportid STRING, - name STRING, - source STRING, - release STRING, - createdby STRING, - report_end_date STRING, - report_start_date STRING) - CLUSTERED BY (reportid) - into 100 buckets stored as orc tblproperties('transactional'='true'); -*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java new file mode 100644 index 000000000..88db1f819 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java @@ -0,0 +1,168 @@ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.*; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class DatasetsStatsDB { + + private String logPath; + private String logRepoPath; + private String logPortalPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class); + + private String CounterRobotsURL; + private ArrayList robotsList; + + public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception { + this.logRepoPath = logRepoPath; + this.logPortalPath = logPortalPath; + + } + + public void recreateDBAndTables() throws Exception { + this.createDatabase(); + this.createTables(); + } + +// public void reCreateLogDirs() throws IllegalArgumentException, IOException { +// FileSystem dfs = FileSystem.get(new Configuration()); +// +// logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); +// dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); +// +// logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); +// dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); +// +// logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); +// dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); +// +// logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); +// dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); +// } + public ArrayList getRobotsList() { + return robotsList; + } + + public void setRobotsList(ArrayList robotsList) { + this.robotsList = robotsList; + } + + public String getCounterRobotsURL() { + return CounterRobotsURL; + } + + public void setCounterRobotsURL(String CounterRobotsURL) { + this.CounterRobotsURL = CounterRobotsURL; + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + // Create Reports table - This table should exist + logger.info("Creating Reports Table"); + String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports(reportid STRING, \n" + + " name STRING, \n" + + " source STRING,\n" + + " release STRING,\n" + + " createdby STRING,\n" + + " report_start_date STRING,\n" + + " report_end_date STRING)\n" + + " CLUSTERED BY (reportid)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + + stmt.executeUpdate(sqlCreateTableDataciteReports); + logger.info("Reports Table Created"); + + // Create Datasets Table + logger.info("Creating DataSets Table"); + String sqlCreateTableDataSets = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasets(ds_type STRING,\n" + + " ds_title STRING,\n" + + " yop STRING,\n" + + " uri STRING,\n" + + " platform STRING,\n" + + " data_type STRING,\n" + + " publisher STRING,\n" + + " publisher_id_type STRING,\n" + + " publisher_id_value STRING,\n" + + " ds_dates_type STRING,\n" + + " ds_pub_date STRING,\n" + + " ds_contributors STRING,\n" + // + " ds_contributor_value array ,\n" + + " reportid STRING)\n" + + " CLUSTERED BY (ds_type)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableDataSets); + logger.info("DataSets Table Created"); + + // Create Datasets Performance Table + logger.info("Creating DataSetsPerformance Table"); + String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance(ds_type STRING,\n" + + " period_end STRING,\n" + + " period_from STRING,\n" + + " access_method STRING,\n" + + " metric_type STRING,\n" + + " count INT,\n" + + " country_counts STRING,\n" + + " reportid STRING)\n" + + " CLUSTERED BY (ds_type)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableDataSetsPerformance); + logger.info("DataSetsPerformance Table Created"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + private Connection getConnection() throws SQLException { + return ConnectDB.getHiveConnection(); + } +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java index 196238ea2..a73b299ec 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java @@ -1,97 +1,102 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.net.URL; -import com.google.gson.JsonObject; -import java.util.ArrayList; -import java.util.Iterator; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.parser.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * - * @author dpie - */ -public class DownloadReportsListFromDatacite { - - private String dataciteBaseURL; - private String dataciteReportPath; - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) throws MalformedURLException, Exception { - - this.dataciteBaseURL = dataciteBaseURL; - this.dataciteReportPath = dataciteReportPath; - } - - public void downloadReportsList() throws ParseException { - StringBuilder responseStrBuilder = new StringBuilder(); - - Gson gson = new Gson(); - - try { - BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream()); - logger.info("Downloading from " + dataciteBaseURL); - - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String inputStr; - - while ((inputStr = streamReader.readLine()) != null) { - responseStrBuilder.append(inputStr); - } - } catch (IOException e) { - logger.info(e.getMessage()); - } - JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class); - JsonArray dataArray = jsonObject.getAsJsonArray("reports"); - ArrayList reportsList = new ArrayList(); - for (JsonElement element : dataArray) { - reportsList.add(element.getAsJsonObject().get("id").getAsString()); - } - - Iterator it = reportsList.iterator(); - while (it.hasNext()) { - String reportId = it.next().toString(); - String url = dataciteBaseURL + reportId; - - try { - BufferedInputStream in = new BufferedInputStream(new URL(url).openStream()); - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String inputStr; - StringBuilder responseStrBuilder2 = new StringBuilder(); - while ((inputStr = streamReader.readLine()) != null) { - responseStrBuilder2.append(inputStr); - } - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs.create(new Path(dataciteReportPath + "/" + reportId + ".json"), - true); - byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - fin.close(); - - fin.close(); - } catch (IOException e) { - System.out.println(e); - } - } - } -} +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Iterator; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +/** + * @author dpie + */ +public class DownloadReportsListFromDatacite { + + private String dataciteBaseURL; + private String dataciteReportPath; + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) + throws MalformedURLException, Exception { + + this.dataciteBaseURL = dataciteBaseURL; + this.dataciteReportPath = dataciteReportPath; + } + + public void downloadReportsList() throws ParseException { + StringBuilder responseStrBuilder = new StringBuilder(); + + Gson gson = new Gson(); + + try { + BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream()); + logger.info("Downloading from " + dataciteBaseURL); + + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String inputStr; + + while ((inputStr = streamReader.readLine()) != null) { + responseStrBuilder.append(inputStr); + } + } catch (IOException e) { + logger.info(e.getMessage()); + } + JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class); + JsonArray dataArray = jsonObject.getAsJsonArray("reports"); + ArrayList reportsList = new ArrayList(); + for (JsonElement element : dataArray) { + reportsList.add(element.getAsJsonObject().get("id").getAsString()); + } + + Iterator it = reportsList.iterator(); + while (it.hasNext()) { + String reportId = it.next().toString(); + String url = dataciteBaseURL + reportId; + + try { + BufferedInputStream in = new BufferedInputStream(new URL(url).openStream()); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String inputStr; + StringBuilder responseStrBuilder2 = new StringBuilder(); + while ((inputStr = streamReader.readLine()) != null) { + responseStrBuilder2.append(inputStr); + } + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path(dataciteReportPath + "/" + reportId + ".json"), + true); + byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + fin.close(); + + fin.close(); + } catch (IOException e) { + System.out.println(e); + } + } + } +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java index 7b3db3115..b28578e4b 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java @@ -18,14 +18,13 @@ public class ExecuteWorkflow { static String dataciteBaseURL; static String dataciteReportPath; - static String dbHiveUrl; - static String dbImpalaUrl; - static String datasetUsageStatsDBSchema; - static String statsDBSchema; - static boolean recreateDbAndTables; - static boolean datasetsEmptyDirs; - static boolean finalTablesVisibleToImpala; - + static String dbHiveUrl; + static String dbImpalaUrl; + static String datasetUsageStatsDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; + static boolean datasetsEmptyDirs; + static boolean finalTablesVisibleToImpala; public static void main(String args[]) throws Exception { @@ -58,11 +57,11 @@ public class ExecuteWorkflow { else datasetsEmptyDirs = false; - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) - finalTablesVisibleToImpala = true; - else - finalTablesVisibleToImpala = false; - +// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) +// finalTablesVisibleToImpala = true; +// else +// finalTablesVisibleToImpala = false; +// UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java new file mode 100644 index 000000000..ccb3eebd3 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -0,0 +1,408 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.io.*; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.sql.Array; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Iterator; +import java.util.Set; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +/** + * @author dpie + */ +public class ReadReportsListFromDatacite { + + private String dataciteReportPath; + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception { + + this.dataciteReportPath = dataciteReportPath; + } + + public void readReports() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + File folder = new File(dataciteReportPath); + ArrayList jsonFiles = listHdfsDir(dataciteReportPath); + for (String jsonFile : jsonFiles) { + logger.info("Reading report file " + jsonFile); + this.createTmpReportsTable(jsonFile); + + String sqlSelectReportID = "SELECT get_json_object(json, '$.report.id') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlSelectReportID); + ResultSet rstmpReportID = stmt.getResultSet(); + + String reportID = null; + while (rstmpReportID.next()) { + reportID = rstmpReportID.getString(1); + } + + logger.info("Checking report with id " + reportID); + String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports where reportid=?"; + PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); + stGetReportID.setString(1, reportID); + + ResultSet rsCheckIfReportExist = stGetReportID.executeQuery(); + + if (rsCheckIfReportExist.next()) { + logger.info("Report found with ID " + reportID); + dropTmpReportsTable(); + } else { + String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + " .datacitereports " + + "SELECT\n" + + " get_json_object(json, '$.report.id') AS reportid,\n" + + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" + + " get_json_object(json, '$.report.report-header.report-id') AS source,\n" + + " get_json_object(json, '$.report.report-header.release') AS release,\n" + + " get_json_object(json, '$.report.report-header.created-by\') AS createdby,\n" + + " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n" + + " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n" + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlInsertReport); + + logger.info("Report added"); + + logger.info("Adding datasets"); + String sqlSelecteDatasetsArray = "SELECT get_json_object(json, '$.report.report-datasets') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlSelecteDatasetsArray); + ResultSet rstmpReportDatasets = stmt.getResultSet(); + + if (rstmpReportDatasets.next() && rstmpReportDatasets.getString(1).indexOf(',') > 0) { + String[] listDatasets = rstmpReportDatasets.getString(1).split(","); + logger.info("Datasets found " + listDatasets.length); + + for (int i = 0; i < listDatasets.length; i++) { + + String sqlInsertDataSets = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + " .datasets " + + "SELECT\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-id[0].value') AS ds_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-title') AS ds_title,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].yop') AS yop,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].uri') AS uri,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].platform') AS platform,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].data-type') AS data_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].publisher') AS publisher,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].publisher-id.type[0]') AS publisher_id_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].publisher-id.value[0]') AS publisher_id_value,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-dates.type[0]') AS ds_dates_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-dates.value[0]') AS ds_dates_value,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-contributors') AS ds_contributors,\n" + + " get_json_object(json, '$.report.id') AS reportid \n" + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlInsertDataSets); + + logger.info("Dataset added " + i); + + logger.info("Adding Dataset Performance"); + String sqlSelecteDatasetsPerformance = "SELECT get_json_object(json, '$.report.report-datasets[" + + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlSelecteDatasetsPerformance); + ResultSet rstmpReportDatasetsPerformance = stmt.getResultSet(); + if (rstmpReportDatasetsPerformance.next() + && rstmpReportDatasetsPerformance.getString(1).indexOf(',') > 0) { + String[] listDatasetsPerformance = rstmpReportDatasetsPerformance.getString(1).split(","); + logger.info("Datasets Performance found " + listDatasetsPerformance.length); + for (int j = 0; j < listDatasetsPerformance.length; j++) { + String sqlSelecteDatasetsPerformanceInstance = "SELECT get_json_object(json, '$.report.report-datasets[" + + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".tmpjson"; + stmt.execute(sqlSelecteDatasetsPerformanceInstance); + ResultSet rstmpReportDatasetsPerformanceInstance = stmt.getResultSet(); + if (rstmpReportDatasetsPerformanceInstance.next() + && rstmpReportDatasetsPerformanceInstance.getString(1).indexOf(',') > 0) { + String[] listDatasetsPerformanceInstance = rstmpReportDatasetsPerformanceInstance + .getString(1) + .split(","); + logger.info("Datasets Performance found " + listDatasetsPerformanceInstance.length); + for (int k = 0; k < listDatasetsPerformanceInstance.length; k++) { + String sqlInsertDataSetsPerformance = "INSERT INTO " + + ConnectDB.getDataSetUsageStatsDBSchema() + " .datasetsperformance " + + "SELECT\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-id[0].value') AS ds_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].period.end-date') AS period_end,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].period.begin-date') AS period_from,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].instance[" + k + + "].access-method') AS access_method,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].instance[" + k + + "].metric-type') AS metric_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].instance[" + k + "].count') AS count,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].instance[" + k + + "].country-counts') AS country_counts,\n" + + " get_json_object(json, '$.report.id') AS reportid \n" + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlInsertDataSetsPerformance); + } + } + } + } + logger.info("DatasetPerformance added for dataset" + i); + } + } + logger.info("Adding gzip performance"); + String sqlSelecteReportSubsets = "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlSelecteReportSubsets); + ResultSet rstmpReportSubsets = stmt.getResultSet(); + if (rstmpReportSubsets.next()) { + String unCompressedReport = uncompressString(rstmpReportSubsets.getString(1)); + this.readCompressedReport(unCompressedReport, reportID); + } + } + } + this.dropTmpReportsTable(); + } + + public void readCompressedReport(String report, String reportId) throws Exception { + Gson gson = new Gson(); + JsonObject jsonObject = gson.fromJson(report, JsonObject.class); + + JsonArray jsonReportDatasets; + if (jsonObject.getAsJsonArray("report_datasets") != null) { + jsonReportDatasets = jsonObject.getAsJsonArray("report_datasets"); + } else { + jsonReportDatasets = jsonObject.getAsJsonArray("report-datasets"); + } + + for (JsonElement datasetElement : jsonReportDatasets) { + // JsonElement dataset_title = datasetElement.getAsJsonObject().get("dataset-title"); + String dataset_title = datasetElement.getAsJsonObject().get("dataset-title").getAsString(); + String yop = datasetElement.getAsJsonObject().get("yop").getAsString(); + String uri = datasetElement.getAsJsonObject().get("uri").getAsString(); + String platform = datasetElement.getAsJsonObject().get("platform").getAsString(); + String data_type = datasetElement.getAsJsonObject().get("data-type").getAsString(); + String publisher = datasetElement.getAsJsonObject().get("publisher").getAsString(); + + JsonArray publisher_id = datasetElement.getAsJsonObject().getAsJsonArray("publisher-id"); + String publisher_id_type = ""; + String publisher_id_value = ""; + for (JsonElement publisher_id_Element : publisher_id) { + publisher_id_type = publisher_id_Element.getAsJsonObject().get("type").getAsString(); + publisher_id_value = publisher_id_Element.getAsJsonObject().get("value").getAsString(); + } + JsonArray dataset_days = datasetElement.getAsJsonObject().getAsJsonArray("dataset-dates"); + String ds_dates_type = ""; + String ds_dates_value = ""; + for (JsonElement datasetDaysElement : dataset_days) { + ds_dates_type = datasetDaysElement.getAsJsonObject().get("type").getAsString(); + ds_dates_value = datasetDaysElement.getAsJsonObject().get("value").getAsString(); + } + + JsonArray datasetContributors = null; + String ds_contributor_type = ""; + String[] ds_contributor_values = null; + Array ds_contributor_valuesArr = null; + + if (datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors") != null) { + datasetContributors = datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors"); + + JsonArray datasetid = datasetElement.getAsJsonObject().getAsJsonArray("dataset-id"); + String doi = ""; + for (JsonElement datasetIDElement : datasetid) +//System.out.println(datasetIDElement.getAsJsonObject().get("value").getAsString()); + { + doi = datasetIDElement.getAsJsonObject().get("value").getAsString(); + } + + String sqlInsertDataset = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + " .datasets(ds_type," + + "ds_title,yop,uri,platform,data_type,publisher,publisher_id_type,publisher_id_value," + + "ds_dates_type, ds_dates_value, ds_contributors,reportid) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?) "; + + PreparedStatement pstmtDataset = ConnectDB.DB_HIVE_CONNECTION.prepareStatement(sqlInsertDataset); + + pstmtDataset.setString(1, doi); + pstmtDataset.setString(2, dataset_title); + pstmtDataset.setString(3, yop); + pstmtDataset.setString(4, uri); + pstmtDataset.setString(5, platform); + pstmtDataset.setString(6, data_type); + pstmtDataset.setString(7, publisher); + pstmtDataset.setString(8, publisher_id_type); + pstmtDataset.setString(9, publisher_id_value); + pstmtDataset.setString(10, ds_dates_type); + pstmtDataset.setString(11, ds_dates_value); + pstmtDataset.setString(13, datasetContributors.getAsString()); + pstmtDataset.setString(14, reportId); + + pstmtDataset.execute(); + logger.info("Dataset from compressed report addded " + doi); + /* + * JsonArray performance = datasetElement.getAsJsonObject().getAsJsonArray("performance"); for + * (JsonElement performanceElement : performance) { JsonObject period = + * performanceElement.getAsJsonObject().getAsJsonObject("period"); String end_date = + * period.getAsJsonObject().get("end-date").getAsString(); String begin_date = + * period.getAsJsonObject().get("begin-date").getAsString(); JsonArray instance = + * performanceElement.getAsJsonObject().getAsJsonArray("instance"); for (JsonElement instanceElement : + * instance) { int count = instanceElement.getAsJsonObject().get("count").getAsInt(); JsonObject + * country_counts = instanceElement.getAsJsonObject().getAsJsonObject("country-counts"); Set + * keys = country_counts.keySet(); String[] country = new String[country_counts.size()]; String[] + * country_counts_val = new String[country_counts.size()]; Iterator it2 = keys.iterator(); int j = 0; + * while (it2.hasNext()) { country[j] = it2.next().toString(); country_counts_val[j] = + * country_counts.get(country[j]).getAsString(); } Array countryArr = conn.createArrayOf("text", + * country); Array countrycountsArr = conn.createArrayOf("text", country_counts_val); String metrictype + * = instanceElement.getAsJsonObject().get("metric-type").getAsString(); String accessMethod = + * instanceElement.getAsJsonObject().get("access-method").getAsString(); String + * sqlInsertDatasetPerformance = + * "INSERT INTO datasetperformance(ds_type,period_end,period_from,access_method,metric_type,count,country,country_count, reportid) VALUES(?,?,?,?,?,?,?,?,?)" + * ; PreparedStatement pstmtDatasetPerformance = conn.prepareStatement(sqlInsertDatasetPerformance); + * //System.out.println(begin_date + " " + end_date + " " + doi + " " + metrictype + " " + count); + * pstmtDatasetPerformance.setString(1, doi); pstmtDatasetPerformance.setString(2, end_date); + * pstmtDatasetPerformance.setString(3, begin_date); pstmtDatasetPerformance.setString(4, accessMethod); + * pstmtDatasetPerformance.setString(5, metrictype); pstmtDatasetPerformance.setInt(6, count); + * pstmtDatasetPerformance.setArray(7, countryArr); pstmtDatasetPerformance.setArray(8, + * countrycountsArr); pstmtDatasetPerformance.setString(9, reportId); pstmtDatasetPerformance.execute(); + * } } + */ + } + } + + } + + private ArrayList listHdfsDir(String dir) throws Exception { + + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + fileNames.add(fileName); + } + + hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir)); + throw new Exception("HDFS file path with exported data does not exist : " + dir, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + sb.append(line); + // sb.append(line); + line = br.readLine(); + } + // result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + result = sb.toString().trim(); + // fs.close(); + } catch (Exception e) { + throw new Exception(e); + } + + return result; + } + + public static String uncompressString(String zippedBase64Str) + throws IOException { + String result = null; + + // In my solr project, I use org.apache.solr.common.util.Base64. + // byte[] bytes = + // org.apache.solr.common.util.Base64.base64ToByteArray(zippedBase64Str); + byte[] bytes = Base64.getDecoder().decode(zippedBase64Str); + GZIPInputStream zi = null; + try { + zi = new GZIPInputStream(new ByteArrayInputStream(bytes)); + result = IOUtils.toString(zi); + } finally { + IOUtils.closeQuietly(zi); + } + return result; + } + + private void createTmpReportsTable(String jsonFile) throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + dropTmpReportsTable(); + String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".tmpjson (json STRING)"; + stmt.executeUpdate(createTmpTable); + logger.info("Tmp Table Created"); + + String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(insertJsonReport); + logger.info("JSON Report File inserted to tmpjson Table"); + } + + private void dropTmpReportsTable() throws SQLException { + logger.info("Dropping tmpjson Table"); + String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + stmt.executeUpdate(dropTmpTable); + logger.info("Dropped tmpjson Table"); + + } + +} + +/* + * PreparedStatement prepStatem = conn. + * prepareStatement("insert into usageStats (source, entityID,sourceItemType,entityType, counter,action,timestamp_month,referrer) values (?,?,?,?,?,?,?,?)" + * ); + */ diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java index 28c4f30a1..7b07fbc25 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.io.IOException; @@ -17,220 +18,94 @@ import org.slf4j.LoggerFactory; */ public class UsageStatsExporter { - private Statement stmt = null; + private Statement stmt = null; - public UsageStatsExporter() { + public UsageStatsExporter() { - } + } - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - private void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); + private void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath); - dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true); + logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath); + dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true); - logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath)); + logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath)); - } + } - public void export() throws Exception { + public void export() throws Exception { - logger.info("Initialising DB properties"); - ConnectDB.init(); - ConnectDB.getHiveConnection(); + logger.info("Initialising DB properties"); + ConnectDB.init(); + ConnectDB.getHiveConnection(); - if (ExecuteWorkflow.recreateDbAndTables) { - createDatabase(); - createTables(); - reCreateLogDirs(); - } - logger.info("Initializing the download logs module"); - DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, ExecuteWorkflow.dataciteReportPath); + if (ExecuteWorkflow.recreateDbAndTables) { + DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", ""); + datasetsDB.recreateDBAndTables(); + } + logger.info("Initializing the download logs module"); + DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, + ExecuteWorkflow.dataciteReportPath); - if (ExecuteWorkflow.datasetsEmptyDirs) { - logger.info("Downloading Reports List From Datacite"); - drfd.downloadReportsList(); - logger.info("Reports List has been downloaded"); - } - } + if (ExecuteWorkflow.datasetsEmptyDirs) { + logger.info("Downloading Reports List From Datacite"); + drfd.downloadReportsList(); + logger.info("Reports List has been downloaded"); + } - private void createDatabase() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Dropping datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); - } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); - } - - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); - - } catch (Exception e) { - logger.error("Failed to create database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } - } - - private void createTables() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - // Create Reports table - This table should exist - String sqlCreateTableDataciteeReports = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports(reportid STRING, \n" - + " name STRING, \n" - + " source STRING,\n" - + " release STRING,\n" - + " createdby STRING,\n" - + " report_end_date STRING,\n" - + " report_start_date STRING)\n" - + " CLUSTERED BY (reportid)\n" - + " into 100 buckets stored as orc tblproperties('transactional'='true')"; - - stmt.executeUpdate(sqlCreateTableDataciteeReports); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite( + ExecuteWorkflow.dataciteReportPath); + logger.info("Store Reports To DB"); + readReportsListFromDatacite.readReports(); + logger.info("Reports Stored To DB"); + } // runImpalaQuery(); -/* - PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); - - logger.info("Re-creating database and tables"); - - logger.info("Initializing the download logs module"); - PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); - - if (ExecuteWorkflow.piwikEmptyDirs) { - logger.info("Recreating Piwik log directories"); - piwikstatsdb.reCreateLogDirs(); - } - - // Downloading piwik logs (also managing directory creation) - if (ExecuteWorkflow.downloadPiwikLogs) { - logger.info("Downloading piwik logs"); - piwd - .GetOpenAIRELogs( - ExecuteWorkflow.repoLogPath, - ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); - } - logger.info("Downloaded piwik logs"); - - // Create DB tables, insert/update statistics - String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; - piwikstatsdb.setCounterRobotsURL(cRobotsUrl); - - if (ExecuteWorkflow.processPiwikLogs) { - logger.info("Processing logs"); - piwikstatsdb.processLogs(); - } - - logger.info("Creating LaReferencia tables"); - LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, - ExecuteWorkflow.lareferenciaAuthToken); - - if (ExecuteWorkflow.laReferenciaEmptyDirs) { - logger.info("Recreating LaReferencia log directories"); - lrf.reCreateLogDirs(); - } - - if (ExecuteWorkflow.downloadLaReferenciaLogs) { - logger.info("Downloading LaReferencia logs"); - lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); - logger.info("Downloaded LaReferencia logs"); - } - LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); - - if (ExecuteWorkflow.processLaReferenciaLogs) { - logger.info("Processing LaReferencia logs"); - lastats.processLogs(); - logger.info("LaReferencia logs done"); - } - - IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); - if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { - logger.info("Creating Irus Stats tables"); - irusstats.createTables(); - logger.info("Created Irus Stats tables"); - - logger.info("Re-create log dirs"); - irusstats.reCreateLogDirs(); - logger.info("Re-created log dirs"); - } - - if (ExecuteWorkflow.irusDownloadReports) { - irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); - } - if (ExecuteWorkflow.irusProcessStats) { - irusstats.processIrusStats(); - logger.info("Irus done"); - } - - SarcStats sarcStats = new SarcStats(); - if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { - sarcStats.reCreateLogDirs(); - } - if (ExecuteWorkflow.sarcDownloadReports) { - sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - } - if (ExecuteWorkflow.sarcProcessStats) { - sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - sarcStats.finalizeSarcStats(); - } - logger.info("Sarc done"); - - // finalize usagestats - if (ExecuteWorkflow.finalizeStats) { - piwikstatsdb.finalizeStats(); - logger.info("Finalized stats"); - } - - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } - - logger.info("End"); - */ + /* + * PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + * logger.info("Re-creating database and tables"); logger.info("Initializing the download logs module"); + * PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); + * if (ExecuteWorkflow.piwikEmptyDirs) { logger.info("Recreating Piwik log directories"); + * piwikstatsdb.reCreateLogDirs(); } // Downloading piwik logs (also managing directory creation) if + * (ExecuteWorkflow.downloadPiwikLogs) { logger.info("Downloading piwik logs"); piwd .GetOpenAIRELogs( + * ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); } + * logger.info("Downloaded piwik logs"); // Create DB tables, insert/update statistics String cRobotsUrl = + * "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; + * piwikstatsdb.setCounterRobotsURL(cRobotsUrl); if (ExecuteWorkflow.processPiwikLogs) { + * logger.info("Processing logs"); piwikstatsdb.processLogs(); } logger.info("Creating LaReferencia tables"); + * LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, + * ExecuteWorkflow.lareferenciaAuthToken); if (ExecuteWorkflow.laReferenciaEmptyDirs) { + * logger.info("Recreating LaReferencia log directories"); lrf.reCreateLogDirs(); } if + * (ExecuteWorkflow.downloadLaReferenciaLogs) { logger.info("Downloading LaReferencia logs"); + * lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); } + * LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if + * (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); lastats.processLogs(); + * logger.info("LaReferencia logs done"); } IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if + * (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables"); + * irusstats.createTables(); logger.info("Created Irus Stats tables"); logger.info("Re-create log dirs"); + * irusstats.reCreateLogDirs(); logger.info("Re-created log dirs"); } if (ExecuteWorkflow.irusDownloadReports) { + * irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } if (ExecuteWorkflow.irusProcessStats) { + * irusstats.processIrusStats(); logger.info("Irus done"); } SarcStats sarcStats = new SarcStats(); if + * (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); } if + * (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, + * ExecuteWorkflow.sarcsReportPathNonArray); } if (ExecuteWorkflow.sarcProcessStats) { + * sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + * sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); // finalize usagestats if + * (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); logger.info("Finalized stats"); } // Make the + * tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) { + * logger.info("Making tables visible to Impala"); invalidateMetadata(); } logger.info("End"); + */ } /* - private void invalidateMetadata() throws SQLException { - Statement stmt = null; - - stmt = ConnectDB.getImpalaConnection().createStatement(); - - String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; - stmt.executeUpdate(sql); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } + * private void invalidateMetadata() throws SQLException { Statement stmt = null; stmt = + * ConnectDB.getImpalaConnection().createStatement(); String sql = "INVALIDATE METADATA " + + * ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + * + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + + * ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + + * ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); stmt.close(); + * ConnectDB.getHiveConnection().close(); } */ diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index 338b2a2c5..44f28ff56 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -2,13 +2,13 @@ - - + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 0.13.1-cdh5.2.1 2.5.0-cdh5.2.1 - - + + org.apache.spark @@ -53,16 +81,16 @@ 20180130 jar - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + eu.dnetlib.dhp dhp-common diff --git a/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh b/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh new file mode 100755 index 000000000..4465dae21 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagerawdata \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java index f76644c83..5b2e6804b 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java @@ -122,4 +122,4 @@ public abstract class ConnectDB { } -} +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java index 81e34b3e7..e0e0d3687 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java @@ -62,7 +62,6 @@ public class ExecuteWorkflow { static int sarcNumberOfIssnToDownload; static boolean finalizeStats; - static boolean finalTablesVisibleToImpala; static int numberOfDownloadThreads; @@ -98,98 +97,108 @@ public class ExecuteWorkflow { usageStatsDBSchema = parser.get("usageStatsDBSchema"); statsDBSchema = parser.get("statsDBSchema"); - if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { recreateDbAndTables = true; - else + } else { recreateDbAndTables = false; + } - if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) + if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) { piwikEmptyDirs = true; - else + } else { piwikEmptyDirs = false; + } - if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) + if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) { downloadPiwikLogs = true; - else + } else { downloadPiwikLogs = false; + } - if (parser.get("processPiwikLogs").toLowerCase().equals("true")) + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { processPiwikLogs = true; - else + } else { processPiwikLogs = false; + } - String startingLogPeriodStr = parser.get("startingLogPeriod"); + String startingLogPeriodStr = parser.get("startingLogPeriod"); Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); - String endingLogPeriodStr = parser.get("endingLogPeriod"); - Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); - endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); +// String endingLogPeriodStr = parser.get("endingLogPeriod"); +// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); +// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); - if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) + if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) { laReferenciaEmptyDirs = true; - else + } else { laReferenciaEmptyDirs = false; + } - if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) + if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) { downloadLaReferenciaLogs = true; - else + } else { downloadLaReferenciaLogs = false; + } - if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { processLaReferenciaLogs = true; - else + } else { processLaReferenciaLogs = false; + } - if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) + if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) { irusCreateTablesEmptyDirs = true; - else + } else { irusCreateTablesEmptyDirs = false; + } - if (parser.get("irusDownloadReports").toLowerCase().equals("true")) + if (parser.get("irusDownloadReports").toLowerCase().equals("true")) { irusDownloadReports = true; - else + } else { irusDownloadReports = false; + } - if (parser.get("irusProcessStats").toLowerCase().equals("true")) + if (parser.get("irusProcessStats").toLowerCase().equals("true")) { irusProcessStats = true; - else + } else { irusProcessStats = false; + } irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); - if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) + if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) { sarcCreateTablesEmptyDirs = true; - else + } else { sarcCreateTablesEmptyDirs = false; + } - if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) + if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) { sarcDownloadReports = true; - else + } else { sarcDownloadReports = false; + } - if (parser.get("sarcProcessStats").toLowerCase().equals("true")) + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { sarcProcessStats = true; - else + } else { sarcProcessStats = false; + } sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); -/* - if (parser.get("finalizeStats").toLowerCase().equals("true")) + if (parser.get("finalizeStats").toLowerCase().equals("true")) { finalizeStats = true; - else + } else { finalizeStats = false; - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) - finalTablesVisibleToImpala = true; - else - finalTablesVisibleToImpala = false; -*/ + } + numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); + // usagestatsExport.createdDBWithTablesOnly(); } private static Calendar startingLogPeriodStr(Date date) { diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java index bb8d8565e..7ec5b0fca 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagerawdata.export; import java.io.*; @@ -27,393 +28,331 @@ import org.slf4j.LoggerFactory; */ public class IrusStats { - private String irusUKURL; + private String irusUKURL; - private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); - public IrusStats(String irusUKURL) throws Exception { - this.irusUKURL = irusUKURL; - // The following may not be needed - It will be created when JSON tables are created + public IrusStats(String irusUKURL) throws Exception { + this.irusUKURL = irusUKURL; + // The following may not be needed - It will be created when JSON tables are created // createTmpTables(); - } + } - public void reCreateLogDirs() throws Exception { - FileSystem dfs = FileSystem.get(new Configuration()); + public void reCreateLogDirs() throws Exception { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); + logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); - logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); - } + logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); + } - public void createTables() throws Exception { - try { - logger.info("Creating sushilog"); - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog(source STRING, " - + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " - + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableSushiLog); - logger.info("Created sushilog"); + public void createTables() throws Exception { + try { + logger.info("Creating sushilog"); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); - // To see how to apply to the ignore duplicate rules and indexes -// stmt.executeUpdate(sqlCreateTableSushiLog); -// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO sushilog " -// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," -// + "sushilog.rid, sushilog.date " -// + "FROM sushilog " -// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlcreateRuleSushiLog); -// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; -// stmt.executeUpdate(createSushiIndex); - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } -// // The following may not be needed - It will be created when JSON tables are created -// private void createTmpTables() throws Exception { -// try { -// -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; -// stmt.executeUpdate(sqlCreateTableSushiLog); -// -// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;"); -// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;"; -// // stmt.executeUpdate(sqlCopyPublicSushiLog); -// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO sushilogtmp " -// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository," -// + "sushilogtmp.rid, sushilogtmp.date " -// + "FROM sushilogtmp " -// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlcreateRuleSushiLog); -// -// stmt.close(); -// ConnectDB.getConnection().close(); -// log.info("Sushi Tmp Tables Created"); -// } catch (Exception e) { -// log.error("Failed to create tables: " + e); -// throw new Exception("Failed to create tables: " + e.toString(), e); -// } -// } - public void processIrusStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); - logger.info("Dropping sushilogtmp_json table"); - String dropSushilogtmpJson = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sushilogtmp_json"; - stmt.executeUpdate(dropSushilogtmpJson); - logger.info("Dropped sushilogtmp_json table"); + logger.info("Dropping sushilogtmp_json table"); + String dropSushilogtmpJson = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sushilogtmp_json"; + stmt.executeUpdate(dropSushilogtmpJson); + logger.info("Dropped sushilogtmp_json table"); - logger.info("Creating irus_sushilogtmp_json table"); - String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" - + " `ItemIdentifier` ARRAY<\n" - + " struct<\n" - + " Type: STRING,\n" - + " Value: STRING\n" - + " >\n" - + " >,\n" - + " `ItemPerformance` ARRAY<\n" - + " struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >\n" - + " >\n" - + ")\n" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(createSushilogtmpJson); - logger.info("Created irus_sushilogtmp_json table"); + logger.info("Creating irus_sushilogtmp_json table"); + String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " Type: STRING,\n" + + " Value: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` ARRAY<\n" + + " struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(createSushilogtmpJson); + logger.info("Created irus_sushilogtmp_json table"); - logger.info("Dropping irus_sushilogtmp table"); - String dropSushilogtmp = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(dropSushilogtmp); - logger.info("Dropped irus_sushilogtmp table"); + logger.info("Dropping irus_sushilogtmp table"); + String dropSushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(dropSushilogtmp); + logger.info("Dropped irus_sushilogtmp table"); - logger.info("Creating irus_sushilogtmp table"); - String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp(source STRING, repository STRING, " - + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + "tblproperties('transactional'='true')"; - stmt.executeUpdate(createSushilogtmp); - logger.info("Created irus_sushilogtmp table"); + logger.info("Creating irus_sushilogtmp table"); + String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(createSushilogtmp); + logger.info("Created irus_sushilogtmp table"); - logger.info("Inserting to irus_sushilogtmp table"); - String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " - + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " - + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " - + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " - + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " - + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " - + "WHERE `ItemIdent`.`Type`= 'OAI'"; - stmt.executeUpdate(insertSushilogtmp); - logger.info("Inserted to irus_sushilogtmp table"); -/* - logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats table"); + logger.info("Inserting to irus_sushilogtmp table"); + String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " + + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " + + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + + "WHERE `ItemIdent`.`Type`= 'OAI'"; + stmt.executeUpdate(insertSushilogtmp); + logger.info("Inserted to irus_sushilogtmp table"); - logger.info("Inserting into downloads_stats"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_oids ro " - + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; - stmt.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats"); + logger.info("Inserting to sushilog table"); + String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(insertToShushilog); + logger.info("Inserted to sushilog table"); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository_id` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); -*/ - logger.info("Inserting to sushilog table"); - String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " - + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(insertToShushilog); - logger.info("Inserted to sushilog table"); + ConnectDB.getHiveConnection().close(); + } - ConnectDB.getHiveConnection().close(); - } + public void getIrusRRReport(String irusUKReportPath) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); - public void getIrusRRReport(String irusUKReportPath) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); + logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); - String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) - + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; - logger.info("(getIrusRRReport) Getting report: " + reportUrl); + logger.info("(getIrusRRReport) Getting report: " + reportUrl); - String text = getJson(reportUrl, "", ""); + String text = getJson(reportUrl, "", ""); - List opendoarsToVisit = new ArrayList(); - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - int i = 0; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); - for (Object identifier : itemIdentifier) { - JSONObject opendoar = (JSONObject) identifier; - if (opendoar.get("Type").toString().equals("OpenDOAR")) { - i++; - opendoarsToVisit.add(opendoar.get("Value").toString()); - break; - } - } - // break; - } + List opendoarsToVisit = new ArrayList(); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray != null) { + int i = 0; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject opendoar = (JSONObject) identifier; + if (opendoar.get("Type").toString().equals("OpenDOAR")) { + i++; + opendoarsToVisit.add(opendoar.get("Value").toString()); + break; + } + } + // break; + } - logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); + logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); - if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 - && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - } + if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 + && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + } - logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); + logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); - for (String opendoar : opendoarsToVisit) { - logger.info("Now working on openDoar: " + opendoar); - this.getIrusIRReport(opendoar, irusUKReportPath); - } + for (String opendoar : opendoarsToVisit) { + logger.info("Now working on openDoar: " + opendoar); + this.getIrusIRReport(opendoar, irusUKReportPath); + } + logger.info("(getIrusRRReport) Finished with report: " + reportUrl); + } else { + logger.info("IRUS Reports not found for day"); + } - logger.info("(getIrusRRReport) Finished with report: " + reportUrl); - } + } - private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { + private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { - logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); + logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); - ConnectDB.getHiveConnection().setAutoCommit(false); + ConnectDB.getHiveConnection().setAutoCommit(false); - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + // Setting the ending period (last day of the month) + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, "opendoar____::" + opendoar); - ResultSet rs_date = st.executeQuery(); - Date dateMax = null; - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - int batch_size = 0; +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); - } else { - while (start.before(end)) { - logger.info("date: " + simpleDateFormat.format(start.getTime())); - String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) - + "&RepositoryIdentifier=opendoar%3A" + opendoar - + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; - start.add(Calendar.MONTH, 1); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, "opendoar____::" + opendoar); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + int batch_size = 0; - logger.info("Downloading file: " + reportUrl); - String text = getJson(reportUrl, "", ""); - if (text == null) { - continue; - } + if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); + } else { + start.add(Calendar.MONTH, 1); + while (start.before(end)) { + logger.info("Downloading for date: " + simpleDateFormat.format(start.getTime())); + String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) + + "&RepositoryIdentifier=opendoar%3A" + opendoar + + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + start.add(Calendar.MONTH, 1); - FileSystem fs = FileSystem.get(new Configuration()); - String filePath = irusUKReportPath + "/" + "IrusIRReport_" - + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePath); - FSDataOutputStream fin = fs.create(new Path(filePath), true); + logger.info("Downloading file: " + reportUrl); + String text = getJson(reportUrl, "", ""); + if (text == null) { + continue; + } - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - if (jsonArray == null) { - continue; - } - String oai = ""; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - fin.write(jsonObjectRow.toJSONString().getBytes()); - fin.writeChar('\n'); - } + FileSystem fs = FileSystem.get(new Configuration()); + String filePath = irusUKReportPath + "/" + "IrusIRReport_" + + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePath); + FSDataOutputStream fin = fs.create(new Path(filePath), true); - fin.close(); - } + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray == null) { + continue; + } + String oai = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + fin.write(jsonObjectRow.toJSONString().getBytes()); + fin.writeChar('\n'); + } - } - //ConnectDB.getHiveConnection().close(); + fin.close(); + } - logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); - } + } + // ConnectDB.getHiveConnection().close(); - private String getJson(String url) throws Exception { - try { - System.out.println("===> Connecting to: " + url); - URL website = new URL(url); - System.out.println("Connection url -----> " + url); - URLConnection connection = website.openConnection(); + logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); + } - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); + private String getJson(String url) throws Exception { + try { + System.out.println("===> Connecting to: " + url); + URL website = new URL(url); + System.out.println("Connection url -----> " + url); + URLConnection connection = website.openConnection(); + + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); // response.append("\n"); - } - } + } + } - System.out.println("response ====> " + response.toString()); + System.out.println("response ====> " + response.toString()); - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } - private String getJson(String url, String username, String password) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL", e); - return null; - } - } + private String getJson(String url, String username, String password) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL", e); + return null; + } + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java index 88550579b..904290af8 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagerawdata.export; import java.io.*; @@ -27,49 +28,49 @@ import org.slf4j.LoggerFactory; */ public class LaReferenciaDownloadLogs { - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; - /* + /* * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; - private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; - private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); - public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; - this.createTables(); + public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + this.createTables(); // this.createTmpTables(); - } + } - public void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); - logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); - } + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } - private void createTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating LaReferencia tables"); - String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " - + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " - + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " - + "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableLareferenciaLog); - logger.info("Created LaReferencia tables"); + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); // String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " // + " ON INSERT TO lareferencialog " // + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," @@ -80,16 +81,16 @@ public class LaReferenciaDownloadLogs { // stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); // stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Lareferencia Tables Created"); + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - // System.exit(0); - } - } + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } // private void createTmpTables() throws Exception { // @@ -114,152 +115,159 @@ public class LaReferenciaDownloadLogs { // // System.exit(0); // } // } - private String getPiwikLogUrl() { - return piwikUrl + "/"; - } + private String getPiwikLogUrl() { + return piwikUrl + "/"; + } - private String getJson(String url) throws Exception { - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); + private String getJson(String url) throws Exception { + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); // response.append("\n"); - } - } + } + } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } - public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { + public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { - String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; - String content = ""; + String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; + String content = ""; - List siteIdsToVisit = new ArrayList(); + List siteIdsToVisit = new ArrayList(); - // Getting all the siteIds in a list for logging reasons & limiting the list - // to the max number of siteIds - content = getJson(baseApiUrl); - JSONParser parser = new JSONParser(); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); - } - logger.info("Found the following siteIds for download: " + siteIdsToVisit); + // Getting all the siteIds in a list for logging reasons & limiting the list + // to the max number of siteIds + content = getJson(baseApiUrl); + JSONParser parser = new JSONParser(); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); + } + logger.info("Found the following siteIds for download: " + siteIdsToVisit); - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 - && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } - logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); + logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); - for (int siteId : siteIdsToVisit) { - logger.info("Now working on LaReferencia MatomoId: " + siteId); - this.GetLaReFerenciaLogs(repoLogsPath, siteId); - } - } + for (int siteId : siteIdsToVisit) { + logger.info("Now working on LaReferencia MatomoId: " + siteId); + this.GetLaReFerenciaLogs(repoLogsPath, siteId); + } + } - public void GetLaReFerenciaLogs(String repoLogsPath, - int laReferencialMatomoID) throws Exception { + public void GetLaReFerenciaLogs(String repoLogsPath, + int laReferencialMatomoID) throws Exception { - logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); + logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".lareferencialog WHERE matomoid=?"); - st.setInt(1, laReferencialMatomoID); - Date dateMax = null; + logger.info("Ending period for log download: " + sdf.format(end.getTime())); - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialog WHERE matomoid=?"); + st.setInt(1, laReferencialMatomoID); + Date dateMax = null; - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - Date date = currDay.getTime(); - if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID); - } else { - logger - .info( - "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " - + sdf.format(date)); + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - outFolder = repoLogsPath; + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + Date date = currDay.getTime(); + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger + .info( + "Date found in logs " + dateMax + " and not downloanding Matomo logs for " + + laReferencialMatomoID); + } else { + logger + .info( + "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " + + sdf.format(date)); - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), - true); + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + outFolder = repoLogsPath; - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - int i = 0; + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path( + outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), + true); - JSONParser parser = new JSONParser(); - do { - String apiUrl = baseApiUrl; + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + int i = 0; - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + JSONParser parser = new JSONParser(); + do { + String apiUrl = baseApiUrl; - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - fin.write(jsonObjectRaw.toJSONString().getBytes()); - fin.writeChar('\n'); - } + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - logger - .info( - "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID - + " and for " - + sdf.format(date)); - i++; - } while (true); - fin.close(); - } - } - } + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + fin.write(jsonObjectRaw.toJSONString().getBytes()); + fin.writeChar('\n'); + } + + logger + .info( + "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID + + " and for " + + sdf.format(date)); + i++; + } while (true); + fin.close(); + } + } + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java index c20781767..bcf1711cb 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java @@ -61,15 +61,6 @@ public class LaReferenciaStats { "stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTableLareferenciaLog); logger.info("Created LaReferencia tables"); -// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialog " -// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," -// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " -// + "FROM lareferencialog " -// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; -// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); -// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); stmt.close(); ConnectDB.getHiveConnection().close(); @@ -82,30 +73,6 @@ public class LaReferenciaStats { } } -// private void createTmpTables() throws Exception { -// -// try { -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; -// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialogtmp " -// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," -// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " -// + "FROM lareferencialogtmp " -// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); -// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); -// -// stmt.close(); -// log.info("Lareferencia Tmp Tables Created"); -// -// } catch (Exception e) { -// log.error("Failed to create tmptables: " + e); -// throw new Exception("Failed to create tmp tables: " + e.toString(), e); -// // System.exit(0); -// } -// } - public void processLogs() throws Exception { try { logger.info("Processing LaReferencia repository logs"); @@ -116,16 +83,7 @@ public class LaReferenciaStats { removeDoubleClicks(); logger.info("LaReferencia removed double clicks"); -/******** - logger.info("LaReferencia creating viewsStats"); - viewsStats(); - logger.info("LaReferencia created viewsStats"); - logger.info("LaReferencia creating downloadsStats"); - downloadsStats(); - logger.info("LaReferencia created downloadsStats"); - -************/ - logger.info("LaReferencia updating Production Tables"); + logger.info("LaReferencia updating Production Tables"); updateProdTables(); logger.info("LaReferencia updated Production Tables"); @@ -255,88 +213,6 @@ public class LaReferenciaStats { // conn.close(); } - public void viewsStats() throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Creating la_result_views_monthly_tmp view"); - String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp AS " - + - "SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + - "THEN 1 ELSE 0 END) AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp where action='action' and " + - "(source_item_type='oaItem' or source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + - "source ORDER BY source, entity_id"; - stmt.executeUpdate(sql); - logger.info("Created la_result_views_monthly_tmp view"); - - logger.info("Dropping la_views_stats_tmp table"); - sql = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".la_views_stats_tmp"; - stmt.executeUpdate(sql); - logger.info("Dropped la_views_stats_tmp table"); - - logger.info("Creating la_views_stats_tmp table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp " + - "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(views) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.oid AND p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month"; - stmt.executeUpdate(sql); - logger.info("Created la_views_stats_tmp table"); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } - - private void downloadsStats() throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Creating la_result_downloads_monthly_tmp view"); - String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() - + ".la_result_downloads_monthly_tmp AS " + - "SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + - "THEN 1 ELSE 0 END) AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp where action='download' and " + - "(source_item_type='oaItem' or source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + - "source ORDER BY source, entity_id"; - stmt.executeUpdate(sql); - logger.info("Created la_result_downloads_monthly_tmp view"); - - logger.info("Dropping la_downloads_stats_tmp table"); - sql = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".la_downloads_stats_tmp"; - stmt.executeUpdate(sql); - logger.info("Dropped la_downloads_stats_tmp table"); - - logger.info("Creating la_downloads_stats_tmp table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp " + - "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(downloads) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_downloads_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.oid AND p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month"; - stmt.executeUpdate(sql); - logger.info("Created la_downloads_stats_tmp table"); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } - private void updateProdTables() throws SQLException, Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); @@ -346,40 +222,11 @@ public class LaReferenciaStats { String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog " + "select * from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp"; stmt.executeUpdate(sql); -/***** - logger.info("Updating views_stats"); - sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; - stmt.executeUpdate(sql); -// sql = "insert into public.views_stats select * from la_views_stats_tmp;"; -// stmt.executeUpdate(sql); - - logger.info("Updating downloads_stats"); - sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to usage_stats from lareferencia"); - sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + - "SELECT coalesce(ds.source, vs.source) as source, " + - "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + - "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + - "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + - "coalesce(ds.openaire, 0) as openaire_downloads, " + - "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp AS vs ON ds.source=vs.source " + - "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(sql); - logger.info("Inserted data to usage_stats from lareferencia"); -// sql = "insert into public.downloads_stats select * from la_downloads_stats_tmp;"; -// stmt.executeUpdate(sql); -****/ logger.info("Dropping lareferencialogtmp"); sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp"; - logger.info("Dropped lareferencialogtmp"); - stmt.executeUpdate(sql); + logger.info("Dropped lareferencialogtmp"); + stmt.executeUpdate(sql); stmt.close(); ConnectDB.getHiveConnection().close(); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java index 5cc9ec563..a84d6743f 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java @@ -1,9 +1,12 @@ + package eu.dnetlib.oa.graph.usagerawdata.export; import java.io.*; import java.net.Authenticator; import java.net.URL; import java.net.URLConnection; +import java.nio.file.Files; +import java.nio.file.Paths; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.Statement; @@ -30,299 +33,299 @@ import org.slf4j.LoggerFactory; */ public class PiwikDownloadLogs { - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; - /* + /* * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; - private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); + private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); - public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; + public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; - } + } - private String getPiwikLogUrl() { - return "https://" + piwikUrl + "/"; - } + private String getPiwikLogUrl() { + return "https://" + piwikUrl + "/"; + } - private String getJson(String url) throws Exception { - try { - logger.debug("Connecting to download the JSON: " + url); - URL website = new URL(url); - URLConnection connection = website.openConnection(); + private String getJson(String url) throws Exception { + try { + logger.debug("Connecting to download the JSON: " + url); + URL website = new URL(url); + URLConnection connection = website.openConnection(); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + url + " Exception: " + e); - throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); - } - } + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + url + " Exception: " + e); + throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); + } + } - class WorkerThread implements Runnable { + class WorkerThread implements Runnable { - private Calendar currDay; - private int siteId; - private String repoLogsPath; - private String portalLogPath; - private String portalMatomoID; + private Calendar currDay; + private int siteId; + private String repoLogsPath; + private String portalLogPath; + private String portalMatomoID; - public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws IOException { - this.currDay = (Calendar) currDay.clone(); - this.siteId = new Integer(siteId); - this.repoLogsPath = new String(repoLogsPath); - this.portalLogPath = new String(portalLogPath); - this.portalMatomoID = new String(portalMatomoID); - } + public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws IOException { + this.currDay = (Calendar) currDay.clone(); + this.siteId = new Integer(siteId); + this.repoLogsPath = new String(repoLogsPath); + this.portalLogPath = new String(portalLogPath); + this.portalMatomoID = new String(portalMatomoID); + } - public void run() { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - System.out - .println( - Thread.currentThread().getName() + " (Start) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId - + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath - + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - try { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + public void run() { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + System.out + .println( + Thread.currentThread().getName() + " (Start) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + try { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - System.out - .println( - Thread.currentThread().getName() + " (End) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId - + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath - + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + System.out + .println( + Thread.currentThread().getName() + " (End) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + } - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; - int i = 0; + int i = 0; - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); - writtenBytes += jsonObjectRawBytes.length + 1; - } + writtenBytes += jsonObjectRawBytes.length + 1; + } - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); - i++; - } while (true); + i++; + } while (true); - fs.close(); - } - } + fs.close(); + } + } - public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { + public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { - Statement statement = ConnectDB.getHiveConnection().createStatement(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + Statement statement = ConnectDB.getHiveConnection().createStatement(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - ResultSet rs = statement - .executeQuery( - "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() - + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + ResultSet rs = statement + .executeQuery( + "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() + + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); - // Getting all the piwikids in a list for logging reasons & limitting the list - // to the max number of piwikids - List piwikIdToVisit = new ArrayList(); - //while (rs.next()) - //piwikIdToVisit.add(rs.getInt(1)); - piwikIdToVisit.add(13); - piwikIdToVisit.add(109); - - logger.info("Found the following piwikIds for download: " + piwikIdToVisit); + // Getting all the piwikids in a list for logging reasons & limitting the list + // to the max number of piwikids + List piwikIdToVisit = new ArrayList(); + while (rs.next()) { + piwikIdToVisit.add(rs.getInt(1)); + } + logger.info("Found the following piwikIds for download: " + piwikIdToVisit); - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 - && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { - logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { + logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } - logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); + logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); - // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); - for (int siteId : piwikIdToVisit) { - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); + // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); + for (int siteId : piwikIdToVisit) { + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); + // Setting the ending period (last day of the month) + // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + // end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); - logger.info("Now working on piwikId: " + siteId); + logger.info("Now working on piwikId: " + siteId); - PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".piwiklog WHERE source=?"); - st.setInt(1, siteId); - Date dateMax = null; - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); + PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog WHERE source=?"); + st.setInt(1, siteId); + Date dateMax = null; + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - // logger.info("Date used " + currDay.toString()); - // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - // executor.execute(worker);// calling execute method of ExecutorService - logger.info("Date used " + currDay.getTime().toString()); + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + // logger.info("Date used " + currDay.toString()); + // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + // executor.execute(worker);// calling execute method of ExecutorService + logger.info("Date used " + currDay.getTime().toString()); - if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); - } else { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - } + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); + } else { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + } - } - } - // executor.shutdown(); - // while (!executor.isTerminated()) { - // } - // System.out.println("Finished all threads"); - } + } + } + // executor.shutdown(); + // while (!executor.isTerminated()) { + // } + // System.out.println("Finished all threads"); + } - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; - int i = 0; + int i = 0; - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); - writtenBytes += jsonObjectRawBytes.length + 1; - } + writtenBytes += jsonObjectRawBytes.length + 1; + } - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); - i++; - } while (true); + i++; + } while (true); - fs.close(); - } + fs.close(); + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java index 4903d9599..9144620b7 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java @@ -60,7 +60,7 @@ public class PiwikStatsDB { this.createTables(); // The piwiklog table is not needed since it is built // on top of JSON files - ////////////this.createTmpTables(); + //////////// this.createTmpTables(); } public ArrayList getRobotsList() { @@ -86,6 +86,7 @@ public class PiwikStatsDB { logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; stmt.executeUpdate(dropDatabase); + } catch (Exception e) { logger.error("Failed to drop database: " + e); throw new Exception("Failed to drop database: " + e.toString(), e); @@ -117,10 +118,15 @@ public class PiwikStatsDB { + "into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTablePiwikLog); +// String dropT = "TRUNCATE TABLE " +// + ConnectDB.getUsageStatsDBSchema() +// + ".piwiklog "; +// stmt.executeUpdate(dropT); +// logger.info("truncated piwiklog"); + ///////////////////////////////////////// // Rule for duplicate inserts @ piwiklog ///////////////////////////////////////// - String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " @@ -131,7 +137,6 @@ public class PiwikStatsDB { ////////////////////////////////////////////////// // Rule for duplicate inserts @ process_portal_log ////////////////////////////////////////////////// - stmt.close(); ConnectDB.getHiveConnection().close(); @@ -141,47 +146,6 @@ public class PiwikStatsDB { } } -/***** public void createTmpTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " - + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " - + "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTmpTablePiwikLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ piwiklogtmp - ////////////////////////////////////////////////// - - ////////////////////////////////////////////////// - // Copy from public.piwiklog to piwiklog - ////////////////////////////////////////////////// - // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;"; - // stmt.executeUpdate(sqlCopyPublicPiwiklog); - - String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTmpTablePortalLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ process_portal_log_tmp - ////////////////////////////////////////////////// - - stmt.close(); - - } catch (Exception e) { - logger.error("Failed to create tmptables: " + e); - throw new Exception("Failed to create tmp tables: " + e.toString(), e); - // System.exit(0); - } - } -******/ public void processLogs() throws Exception { try { ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); @@ -203,23 +167,17 @@ public class PiwikStatsDB { processPortalLog(); logger.info("Portal logs process done"); - logger.info("Processing portal usagestats"); - portalStats(); + logger.info("Processing portal usagestats"); + portalLogs(); logger.info("Portal usagestats process done"); -/***** - logger.info("ViewsStats processing starts"); - viewsStats(); - logger.info("ViewsStats processing ends"); - - logger.info("DownloadsStats processing starts"); - downloadsStats(); - logger.info("DownloadsStats processing starts"); -*****/ logger.info("Updating Production Tables"); updateProdTables(); logger.info("Updated Production Tables"); + logger.info("Create Pedocs Tables"); + createPedocsOldUsageData(); + logger.info("Pedocs Tables Created"); } catch (Exception e) { logger.error("Failed to process logs: " + e); @@ -237,65 +195,65 @@ public class PiwikStatsDB { logger.info("Added JSON Serde jar"); logger.info("Dropping piwiklogtmp_json table"); - String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp_json"; + String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json"; stmt.executeUpdate(drop_piwiklogtmp_json); logger.info("Dropped piwiklogtmp_json table"); logger.info("Creating piwiklogtmp_json"); - String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp_json(\n" + - " `idSite` STRING,\n" + - " `idVisit` STRING,\n" + - " `country` STRING,\n" + - " `referrerName` STRING,\n" + - " `browser` STRING,\n" + - " `actionDetails` ARRAY<\n" + - " struct<\n" + - " type: STRING,\n" + - " url: STRING,\n" + - " `customVariables`: struct<\n" + - " `1`: struct<\n" + - " `customVariablePageValue1`: STRING\n" + - " >\n" + - " >,\n" + - " timestamp: String\n" + - " >\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; + String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " `customVariables`: struct<\n" + + " `1`: struct<\n" + + " `customVariablePageValue1`: STRING\n" + + " >\n" + + " >,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; stmt.executeUpdate(create_piwiklogtmp_json); logger.info("Created piwiklogtmp_json"); logger.info("Dropping piwiklogtmp table"); - String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp"; + String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp"; stmt.executeUpdate(drop_piwiklogtmp); logger.info("Dropped piwiklogtmp"); logger.info("Creating piwiklogtmp"); - String create_piwiklogtmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + - "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; + String create_piwiklogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(create_piwiklogtmp); logger.info("Created piwiklogtmp"); logger.info("Inserting into piwiklogtmp"); - String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + - "actiondetail.type as action, actiondetail.url as url, " + - "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + - "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + - "referrerName as referrer_name, browser as agent\n" + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + + "actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent\n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; stmt.executeUpdate(insert_piwiklogtmp); logger.info("Inserted into piwiklogtmp"); @@ -308,33 +266,31 @@ public class PiwikStatsDB { logger.info("Cleaning download double clicks"); // clean download double clicks - String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "WHERE EXISTS (\n" + - "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + - ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + - "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" - + - "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + - "AND p1.timestamp\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; + String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json(" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; stmt.executeUpdate(create_process_portal_log_tmp_json); logger.info("Created process_portal_log_tmp_json"); logger.info("Droping process_portal_log_tmp table"); - String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".process_portal_log_tmp"; + String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp"; stmt.executeUpdate(drop_process_portal_log_tmp); logger.info("Dropped process_portal_log_tmp"); logger.info("Creating process_portal_log_tmp"); - String create_process_portal_log_tmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + - ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + - "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + String create_process_portal_log_tmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(create_process_portal_log_tmp); logger.info("Created process_portal_log_tmp"); logger.info("Inserting into process_portal_log_tmp"); String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log_tmp " + - "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " - + - "actiondetail.url as url, " + - "CASE\n" + - " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + - " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + - " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " - + - " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + - " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + - " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + - " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + - " ELSE '' " + - "END AS entity_id, " + - "CASE " + - " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + - " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + - " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + - " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + - " ELSE '' " + - "END AS source_item_type, " + - "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + - "browser as agent " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + + ".process_portal_log_tmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " + + "actiondetail.url as url, " + + "CASE\n" + + " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + + " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " + + " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + + " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + + " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + + " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + + " ELSE '' " + + "END AS entity_id, " + + "CASE " + + " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + + " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + + " ELSE '' " + + "END AS source_item_type, " + + "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + + "browser as agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; stmt.executeUpdate(insert_process_portal_log_tmp); logger.info("Inserted into process_portal_log_tmp"); stmt.close(); } - public void portalStats() throws SQLException { + public void portalLogs() throws SQLException { Connection con = ConnectDB.getHiveConnection(); Statement stmt = con.createStatement(); con.setAutoCommit(false); -// Original queries where of the style -// -// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp2, -// openaire_prod_stats_20200821.result_oids roid -// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null -// -// The following query is an example of how queries should be -// -// -// INSERT INTO usagestats_20200907.piwiklogtmp -// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp -// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id -// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL); -// -// We should consider if we would like the queries to be as the following -// -// INSERT INTO usagestats_20200907.piwiklogtmp -// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp -// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id -// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND -// roid.oid != ''); - logger.info("PortalStats - Step 1"); - String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + ".result_oids roid WHERE roid.id IS NOT NULL)"; stmt.executeUpdate(sql); stmt.close(); logger.info("PortalStats - Step 2"); stmt = con.createStatement(); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + ".datasource_oids roid WHERE roid.id IS NOT NULL)"; stmt.executeUpdate(sql); stmt.close(); @@ -494,12 +421,11 @@ public class PiwikStatsDB { */ logger.info("PortalStats - Step 3"); stmt = con.createStatement(); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + ".project_oids roid WHERE roid.id IS NOT NULL)"; stmt.executeUpdate(sql); stmt.close(); @@ -512,233 +438,233 @@ public class PiwikStatsDB { logger.info("Cleaning oai - Step 1"); stmt = ConnectDB.getHiveConnection().createStatement(); - String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + - "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; + String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + + "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 2"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + - "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + + "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 3"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + - "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + + "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 4"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + - "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + + "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 5"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + - "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 6"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + - "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 7"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + - "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 8"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + - "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 9"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + - "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 10"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + - "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 11"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + - "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 12"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + - "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 13"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + - "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 14"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + - "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 15"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + - "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 16"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + - "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 17"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + - "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 18"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + - "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 19"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + - "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 20"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + - "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 21"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + - "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 22"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + - "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 23"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + - "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 24"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + - "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 25"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + - "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 26"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + - "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 27"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + - "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 28"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + - "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + + "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 29"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + - "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; stmt.executeUpdate(sql); stmt.close(); @@ -746,63 +672,83 @@ public class PiwikStatsDB { ConnectDB.getHiveConnection().close(); } - private String processPortalURL(String url) { - - if (url.indexOf("explore.openaire.eu") > 0) { - try { - url = URLDecoder.decode(url, "UTF-8"); - } catch (Exception e) { - logger.info("Error when decoding the following URL: " + url); - } - if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) { - url = "datasource|" - + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59); - } else if (url.indexOf("datasource=") > 0 - && url.substring(url.indexOf("datasource=") + 11).length() >= 46) { - url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57); - } else if (url.indexOf("datasourceFilter=") > 0 - && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) { - url = "datasource|" - + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63); - } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) { - url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56); - } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) { - url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56); - } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 - && !url.contains("oai:dnet:corda")) { - url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56); - } else if (url.indexOf("organizationId=") > 0 - && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) { - url = "organization|" - + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61); - } else { - url = ""; - } - } else { - url = ""; - } - - return url; - } - private void updateProdTables() throws SQLException { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Inserting data to piwiklog"); - String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; stmt.executeUpdate(sql); - + logger.info("Dropping piwiklogtmp"); sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; stmt.executeUpdate(sql); - logger.info("Dropped piwiklogtmp"); - + logger.info("Dropped piwiklogtmp"); + logger.info("Dropping process_portal_log_tmp"); sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; stmt.executeUpdate(sql); - logger.info("Dropped process_portal_log_tmp"); + logger.info("Dropped process_portal_log_tmp"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } + + public void finalizeStats() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping piwiklogtmp"); + String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogtmp"); + + logger.info("Dropping process_portal_log_tmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; + stmt.executeUpdate(sql); + logger.info("Dropped process_portal_log_tmp"); + + logger.info("Dropping irus_sushilogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped irus_sushilogtmp"); + + logger.info("Dropping irus_sushilogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped irus_sushilogtmp_json"); + + logger.info("Dropping lareferencialogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped lareferencialogtmp_json"); + + logger.info("Dropping piwiklogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogtmp_json"); + + logger.info("Dropping process_portal_log_tmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped process_portal_log_tmp_json"); + + logger.info("Dropping sarc_sushilogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp"); + + logger.info("Dropping sarc_sushilogtmp_json_array"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp_json_array"); + + logger.info("Dropping sarc_sushilogtmp_json_non_array"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp_json_non_array"); stmt.close(); ConnectDB.getHiveConnection().close(); @@ -868,4 +814,22 @@ public class PiwikStatsDB { private Connection getConnection() throws SQLException { return ConnectDB.getHiveConnection(); } + + public void createPedocsOldUsageData() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating PeDocs Old Views Table"); + String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pedocsoldviews as select * from default.pedocsviews"; + stmt.executeUpdate(sql); + logger.info("PeDocs Old Views Table created"); + + logger.info("Creating PeDocs Old Downloads Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pedocsolddownloads as select * from default.pedocsdownloads"; + stmt.executeUpdate(sql); + logger.info("PeDocs Old Downloads Table created"); + + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java index 4ca20c52e..e85c972f5 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagerawdata.export; import java.io.*; @@ -33,543 +34,467 @@ import org.slf4j.LoggerFactory; */ public class SarcStats { - private Statement stmtHive = null; - private Statement stmtImpala = null; + private Statement stmtHive = null; + private Statement stmtImpala = null; - private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); - public SarcStats() throws Exception { + public SarcStats() throws Exception { // createTables(); - } + } - private void createTables() throws Exception { - try { + private void createTables() throws Exception { + try { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; - stmtHive.executeUpdate(sqlCreateTableSushiLog); + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); - // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; - // stmt.executeUpdate(sqlCopyPublicSushiLog); - String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " - + " ON INSERT TO sushilog " - + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," - + "sushilog.rid, sushilog.date " - + "FROM sushilog " - + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; - stmtHive.executeUpdate(sqlcreateRuleSushiLog); - String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; - stmtHive.executeUpdate(createSushiIndex); + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } - public void reCreateLogDirs() throws IOException { - FileSystem dfs = FileSystem.get(new Configuration()); + public void reCreateLogDirs() throws IOException { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); - dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); + logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); - logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); - dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); + logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); - logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); - dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); + logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); - logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); - dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); - } + logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); + } - public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); - logger.info("Dropping sarc_sushilogtmp_json_array table"); - String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); - logger.info("Dropped sarc_sushilogtmp_json_array table"); + logger.info("Dropping sarc_sushilogtmp_json_array table"); + String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); + logger.info("Dropped sarc_sushilogtmp_json_array table"); - logger.info("Creating sarc_sushilogtmp_json_array table"); - String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" - + " `ItemIdentifier` ARRAY<\n" - + " struct<\n" - + " `Type`: STRING,\n" - + " `Value`: STRING\n" - + " >\n" - + " >,\n" - + " `ItemPerformance` struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >\n" - + ")" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + sarcsReportPathArray + "/'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_array); - logger.info("Created sarc_sushilogtmp_json_array table"); + logger.info("Creating sarc_sushilogtmp_json_array table"); + String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_array); + logger.info("Created sarc_sushilogtmp_json_array table"); - logger.info("Dropping sarc_sushilogtmp_json_non_array table"); - String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); - logger.info("Dropped sarc_sushilogtmp_json_non_array table"); + logger.info("Dropping sarc_sushilogtmp_json_non_array table"); + String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); + logger.info("Dropped sarc_sushilogtmp_json_non_array table"); - logger.info("Creating sarc_sushilogtmp_json_non_array table"); - String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" - + " `ItemIdentifier` struct<\n" - + " `Type`: STRING,\n" - + " `Value`: STRING\n" - + " >,\n" - + " `ItemPerformance` struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >" - + ")" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + sarcsReportPathNonArray + "/'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); - logger.info("Created sarc_sushilogtmp_json_non_array table"); + logger.info("Creating sarc_sushilogtmp_json_non_array table"); + String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" + + " `ItemIdentifier` struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathNonArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); + logger.info("Created sarc_sushilogtmp_json_non_array table"); - logger.info("Creating sarc_sushilogtmp table"); - String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp(source STRING, repository STRING, " - + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + "tblproperties('transactional'='true')"; - stmt.executeUpdate(create_sarc_sushilogtmp); - logger.info("Created sarc_sushilogtmp table"); + logger.info("Creating sarc_sushilogtmp table"); + String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(create_sarc_sushilogtmp); + logger.info("Created sarc_sushilogtmp table"); - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " - + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " - + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " - + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " - + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " - + "WHERE `ItemIdent`.`Type`='DOI'"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "WHERE `ItemIdent`.`Type`='DOI'"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " - + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " - + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " - + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - ConnectDB.getHiveConnection().close(); - } + ConnectDB.getHiveConnection().close(); + } - public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); - logger.info("Dropping sarc_sushilogtmp table"); - String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp"; - stmt.executeUpdate(drop_sarc_sushilogtmp); - logger.info("Dropped sarc_sushilogtmp table"); - ConnectDB.getHiveConnection().close(); + logger.info("Dropping sarc_sushilogtmp table"); + String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp"; + stmt.executeUpdate(drop_sarc_sushilogtmp); + logger.info("Dropped sarc_sushilogtmp table"); + ConnectDB.getHiveConnection().close(); - List issnAndUrls = new ArrayList(); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" - }); - issnAndUrls.add(new String[]{ - "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" - }); + List issnAndUrls = new ArrayList(); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" + }); + issnAndUrls.add(new String[] { + "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" + }); - if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 - && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); - issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); - } + if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 + && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); + issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); + } - logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); + logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); - for (String[] issnAndUrl : issnAndUrls) { - logger.info("Now working on ISSN: " + issnAndUrl[1]); - getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); - } + for (String[] issnAndUrl : issnAndUrls) { + logger.info("Now working on ISSN: " + issnAndUrl[1]); + getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); + } - } + } - public void finalizeSarcStats() throws Exception { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - stmtImpala = ConnectDB.getImpalaConnection().createStatement(); -/* - logger.info("Creating downloads_stats table_tmp"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_tmp " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmtHive.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats_tmp table"); + public void updateSarcLogs() throws Exception { + stmtHive = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + stmtImpala = ConnectDB.getImpalaConnection().createStatement(); - logger.info("Dropping sarc_sushilogtmp_impala table"); - String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala"; - stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); - logger.info("Dropped sarc_sushilogtmp_impala table"); + // Insert into sushilog + logger.info("Inserting into sushilog"); + String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(insertSushiLog); + logger.info("Inserted into sushilog"); - logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); - String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala " - + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(createSarcSushilogtmpImpala); - logger.info("Created sarc_sushilogtmp_impala"); + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + } - logger.info("Making sarc_sushilogtmp visible to impala"); - String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala;"; - stmtImpala.executeUpdate(invalidateMetadata); + public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, + String url, String issn) throws Exception { + logger.info("Processing SARC! issn: " + issn + " with url: " + url); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Dropping downloads_stats_impala table"); - String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala"; - stmtHive.executeUpdate(drop_downloads_stats_impala); - logger.info("Dropped downloads_stats_impala table"); + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - logger.info("Making downloads_stats_impala deletion visible to impala"); - try { - String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala;"; - stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); - } catch (SQLException sqle) { - } + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); - // We run the following query in Impala because it is faster - logger.info("Creating downloads_stats_impala"); - String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala AS " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " - + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_pids ro " - + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " - + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; - stmtImpala.executeUpdate(createDownloadsStatsImpala); - logger.info("Creating downloads_stats_impala"); + logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - // Insert into downloads_stats - logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_tmp SELECT * " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; - stmtHive.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats_tmp"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, issn); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository_id` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmtHive.executeUpdate(createSushilog); - logger.info("Created sushilog table"); -*/ - // Insert into sushilog - logger.info("Inserting into sushilog"); - String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(insertSushiLog); - logger.info("Inserted into sushilog"); + // Creating the needed configuration for the correct storing of data + Configuration config = new Configuration(); + config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); + config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); + config + .set( + "fs.hdfs.impl", + org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + config + .set( + "fs.file.impl", + org.apache.hadoop.fs.LocalFileSystem.class.getName()); + FileSystem dfs = FileSystem.get(config); - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - } + if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn); + } else { + start.add(Calendar.MONTH, 1); + while (start.before(end)) { + String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); + start.add(Calendar.MONTH, 1); - public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, - String url, String issn) throws Exception { - logger.info("Processing SARC! issn: " + issn + " with url: " + url); - ConnectDB.getHiveConnection().setAutoCommit(false); + logger.info("(getARReport) Getting report: " + reportUrl); + String text = getJson(reportUrl); + if (text == null) { + continue; + } - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = null; + try { + jsonObject = (JSONObject) parser.parse(text); + } // if there is a parsing error continue with the next url + catch (ParseException pe) { + continue; + } - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("sc:Report"); + if (jsonObject == null) { + continue; + } + jsonObject = (JSONObject) jsonObject.get("c:Report"); + jsonObject = (JSONObject) jsonObject.get("c:Customer"); + Object obj = jsonObject.get("c:ReportItems"); + JSONArray jsonArray = new JSONArray(); + if (obj instanceof JSONObject) { + jsonArray.add(obj); + } else { + jsonArray = (JSONArray) obj; + // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); + } + if (jsonArray == null) { + continue; + } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, issn); - ResultSet rs_date = st.executeQuery(); - Date dateMax = null; - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathArray); + FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); - // Creating the needed configuration for the correct storing of data - Configuration config = new Configuration(); - config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); - config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); - config - .set( - "fs.hdfs.impl", - org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - config - .set( - "fs.file.impl", - org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem dfs = FileSystem.get(config); + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathNonArray); + FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); - if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn); - } else { - - while (start.before(end)) { - String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); - start.add(Calendar.MONTH, 1); + for (Object aJsonArray : jsonArray) { - logger.info("(getARReport) Getting report: " + reportUrl); - String text = getJson(reportUrl); - if (text == null) { - continue; - } + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + renameKeysRecursively(":", jsonObjectRow); - JSONParser parser = new JSONParser(); - JSONObject jsonObject = null; - try { - jsonObject = (JSONObject) parser.parse(text); - } // if there is a parsing error continue with the next url - catch (ParseException pe) { - continue; - } + if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { + finNonArray.write(jsonObjectRow.toJSONString().getBytes()); + finNonArray.writeChar('\n'); + } else { + finArray.write(jsonObjectRow.toJSONString().getBytes()); + finArray.writeChar('\n'); + } + } - jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("sc:Report"); - if (jsonObject == null) { - continue; - } - jsonObject = (JSONObject) jsonObject.get("c:Report"); - jsonObject = (JSONObject) jsonObject.get("c:Customer"); - Object obj = jsonObject.get("c:ReportItems"); - JSONArray jsonArray = new JSONArray(); - if (obj instanceof JSONObject) { - jsonArray.add(obj); - } else { - jsonArray = (JSONArray) obj; - // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); - } - if (jsonArray == null) { - continue; - } + finArray.close(); + finNonArray.close(); - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" - + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathArray); - FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); + // Check the file size and if it is too big, delete it + File fileArray = new File(filePathArray); + if (fileArray.length() == 0) { + fileArray.delete(); + } + File fileNonArray = new File(filePathNonArray); + if (fileNonArray.length() == 0) { + fileNonArray.delete(); + } - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" - + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathNonArray); - FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); + } - for (Object aJsonArray : jsonArray) { + dfs.close(); + } + // ConnectDB.getHiveConnection().close(); + } - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - renameKeysRecursively(":", jsonObjectRow); - - if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { - finNonArray.write(jsonObjectRow.toJSONString().getBytes()); - finNonArray.writeChar('\n'); - } else { - finArray.write(jsonObjectRow.toJSONString().getBytes()); - finArray.writeChar('\n'); - } - } - - finArray.close(); - finNonArray.close(); - - // Check the file size and if it is too big, delete it - File fileArray = new File(filePathArray); - if (fileArray.length() == 0) - fileArray.delete(); - File fileNonArray = new File(filePathNonArray); - if (fileNonArray.length() == 0) - fileNonArray.delete(); - - } - - dfs.close(); - } - //ConnectDB.getHiveConnection().close(); - } - - private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { - for (Object jjval : givenJsonObj) { - if (jjval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jjval); - } else if (jjval instanceof JSONObject) { - renameKeysRecursively(delimiter, (JSONObject) jjval); - } // All other types of vals - else + private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { + for (Object jjval : givenJsonObj) { + if (jjval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jjval); + } else if (jjval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jjval); + } // All other types of vals + else ; - } - } + } + } - private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { - Set jkeys = new HashSet(givenJsonObj.keySet()); - for (String jkey : jkeys) { + private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { + Set jkeys = new HashSet(givenJsonObj.keySet()); + for (String jkey : jkeys) { - String[] splitArray = jkey.split(delimiter); - String newJkey = splitArray[splitArray.length - 1]; + String[] splitArray = jkey.split(delimiter); + String newJkey = splitArray[splitArray.length - 1]; - Object jval = givenJsonObj.get(jkey); - givenJsonObj.remove(jkey); - givenJsonObj.put(newJkey, jval); + Object jval = givenJsonObj.get(jkey); + givenJsonObj.remove(jkey); + givenJsonObj.put(newJkey, jval); - if (jval instanceof JSONObject) { - renameKeysRecursively(delimiter, (JSONObject) jval); - } + if (jval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jval); + } - if (jval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jval); - } - } - } + if (jval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jval); + } + } + } - private String getJson(String url) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { + private String getJson(String url) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { - // Logging error and silently continuing - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); + // Logging error and silently continuing + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); // return null; // throw new Exception("Failed to get URL: " + e.toString(), e); - } - return ""; - } + } + return ""; + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java index bf2187569..07e15605f 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java @@ -13,7 +13,7 @@ import org.slf4j.LoggerFactory; /** * Main class for downloading and processing Usage statistics - * + * * @author D. Pierrakos, S. Zoupanos */ public class UsageStatsExporter { @@ -51,19 +51,13 @@ public class UsageStatsExporter { logger.info("Initialising DB properties"); ConnectDB.init(); -// runImpalaQuery(); - PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); logger.info("Re-creating database and tables"); - if (ExecuteWorkflow.recreateDbAndTables){ + if (ExecuteWorkflow.recreateDbAndTables) { piwikstatsdb.recreateDBAndTables(); - logger.info("DB-Tables-TmpTables are created "); - } -// else { -// piwikstatsdb.createTmpTables(); -// logger.info("TmpTables are created "); -// } + logger.info("DB-Tables-TmpTables are created "); + } logger.info("Initializing the download logs module"); PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); @@ -106,9 +100,8 @@ public class UsageStatsExporter { lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); } - - LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); + LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); @@ -116,7 +109,6 @@ public class UsageStatsExporter { logger.info("LaReferencia logs done"); } - IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables"); @@ -132,14 +124,11 @@ public class UsageStatsExporter { irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } - - if (ExecuteWorkflow.irusProcessStats) { + if (ExecuteWorkflow.irusProcessStats) { irusstats.processIrusStats(); logger.info("Irus done"); } - - SarcStats sarcStats = new SarcStats(); if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); @@ -148,51 +137,70 @@ public class UsageStatsExporter { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); } - - if (ExecuteWorkflow.sarcProcessStats) { + if (ExecuteWorkflow.sarcProcessStats) { sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - sarcStats.finalizeSarcStats(); + sarcStats.updateSarcLogs(); } logger.info("Sarc done"); - - -/* // finalize usagestats + + logger.info("Dropping tmp tables"); if (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); - logger.info("Finalized stats"); + logger.info("Dropped tmp tables"); } -*/ -/* - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } -*/ - - logger.info("End"); + logger.info("Raw Data Download End"); } - private void invalidateMetadata() throws SQLException { - Statement stmt = null; + public void createdDBWithTablesOnly() throws Exception { + logger.info("Initialising DB properties"); + ConnectDB.init(); - stmt = ConnectDB.getImpalaConnection().createStatement(); + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + piwikstatsdb.recreateDBAndTables(); - String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + piwikstatsdb.createPedocsOldUsageData(); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); + + logger.info("Creating sushilog"); + + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); + + logger.info("Updating piwiklog"); + String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog select * from openaire_prod_usage_raw.piwiklog"; stmt.executeUpdate(sql); - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + logger.info("Updating lareferencialog"); + sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialog select * from openaire_prod_usage_raw.lareferencialog"; stmt.executeUpdate(sql); - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + logger.info("Updating sushilog"); + sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog select * from openaire_prod_usage_raw.sushilog"; stmt.executeUpdate(sql); - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; - stmt.executeUpdate(sql); - - stmt.close(); + stmt.close(); ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } + } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json index 988c23b48..1aa5ad6f8 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json @@ -125,12 +125,6 @@ "paramDescription": "Starting log period", "paramRequired": true }, - { - "paramName": "elp", - "paramLongName": "endingLogPeriod", - "paramDescription": "Ending log period", - "paramRequired": true - }, { "paramName": "npidd", "paramLongName": "numberOfPiwikIdsToDownload", @@ -216,12 +210,6 @@ "paramDescription": "Create the usage_stats table?", "paramRequired": true }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", - "paramRequired": true - }, { "paramName": "nodt", "paramLongName": "numberOfDownloadThreads", diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml index a6600516d..022a107ab 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml @@ -63,7 +63,6 @@ --downloadPiwikLogs${downloadPiwikLogs} --processPiwikLogs${processPiwikLogs} --startingLogPeriod${startingLogPeriod} - --endingLogPeriod${endingLogPeriod} --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} --laReferenciaEmptyDirs${laReferenciaEmptyDirs} @@ -78,7 +77,6 @@ --sarcProcessStats${sarcProcessStats} --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} --finalizeStats${finalizeStats} - --finalTablesVisibleToImpala${finalTablesVisibleToImpala} --numberOfDownloadThreads${numberOfDownloadThreads} diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index f400239f5..79fabb603 100644 --- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -23,7 +23,35 @@ 4.0.0 dhp-usage-stats-build - + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + UTF-8 UTF-8 diff --git a/dhp-workflows/dhp-usage-stats-build/runworkflow.sh b/dhp-workflows/dhp-usage-stats-build/runworkflow.sh new file mode 100755 index 000000000..191fb24c6 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagestatsbuild \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java index 8f0f8eae7..e53709f1a 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java @@ -3,12 +3,17 @@ * To change this template file, choose Tools | Templates * and open the template in the editor. */ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; import java.sql.Statement; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; import java.util.Properties; import org.apache.log4j.Logger; @@ -23,108 +28,120 @@ import com.mchange.v2.c3p0.ComboPooledDataSource; public abstract class ConnectDB { - public static Connection DB_HIVE_CONNECTION; - public static Connection DB_IMPALA_CONNECTION; + public static Connection DB_HIVE_CONNECTION; + public static Connection DB_IMPALA_CONNECTION; - private static String dbHiveUrl; - private static String dbImpalaUrl; - private static String usageRawDataDBSchema; - private static String usageStatsDBSchema; - private static String statsDBSchema; - private final static Logger log = Logger.getLogger(ConnectDB.class); + private static String dbHiveUrl; + private static String dbImpalaUrl; + private static String usageRawDataDBSchema; + private static String usageStatsDBSchema; + private static String usagestatsPermanentDBSchema; + private static String statsDBSchema; + private final static Logger log = Logger.getLogger(ConnectDB.class); - static void init() throws ClassNotFoundException { + static void init() throws ClassNotFoundException { - dbHiveUrl = ExecuteWorkflow.dbHiveUrl; - dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; - usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; - statsDBSchema = ExecuteWorkflow.statsDBSchema; - usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema; + dbHiveUrl = ExecuteWorkflow.dbHiveUrl; + dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; + usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; + statsDBSchema = ExecuteWorkflow.statsDBSchema; + usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema; + usagestatsPermanentDBSchema = ExecuteWorkflow.usagestatsPermanentDBSchema; - Class.forName("org.apache.hive.jdbc.HiveDriver"); - } + Class.forName("org.apache.hive.jdbc.HiveDriver"); + } - public static Connection getHiveConnection() throws SQLException { - if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { - return DB_HIVE_CONNECTION; - } else { - DB_HIVE_CONNECTION = connectHive(); + public static Connection getHiveConnection() throws SQLException { + if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { + return DB_HIVE_CONNECTION; + } else { + DB_HIVE_CONNECTION = connectHive(); - return DB_HIVE_CONNECTION; - } - } + return DB_HIVE_CONNECTION; + } + } - public static Connection getImpalaConnection() throws SQLException { - if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { - return DB_IMPALA_CONNECTION; - } else { - DB_IMPALA_CONNECTION = connectImpala(); + public static Connection getImpalaConnection() throws SQLException { + if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { + return DB_IMPALA_CONNECTION; + } else { + DB_IMPALA_CONNECTION = connectImpala(); - return DB_IMPALA_CONNECTION; - } - } + return DB_IMPALA_CONNECTION; + } + } - public static String getUsageRawDataDBSchema() { - return usageRawDataDBSchema; - } + public static String getUsageRawDataDBSchema() { + return ConnectDB.usageRawDataDBSchema; + } - public static String getUsageStatsDBSchema() { - return ConnectDB.usageStatsDBSchema; - } + public static String getUsageStatsDBSchema() { + String datePattern = "YYYYMMdd"; + DateFormat df = new SimpleDateFormat(datePattern); +// Get the today date using Calendar object. + Date today = Calendar.getInstance().getTime(); + String todayAsString = df.format(today); - public static String getStatsDBSchema() { - return ConnectDB.statsDBSchema; - } + return ConnectDB.usageStatsDBSchema + "_" + todayAsString; + } - private static Connection connectHive() throws SQLException { - /* + public static String getStatsDBSchema() { + return ConnectDB.statsDBSchema; + } + + public static String getUsagestatsPermanentDBSchema() { + return ConnectDB.usagestatsPermanentDBSchema; + } + + private static Connection connectHive() throws SQLException { + /* * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbHiveUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbHiveUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); - cpds.setAcquireRetryAttempts(30); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); + cpds.setAcquireRetryAttempts(30); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - return cpds.getConnection(); + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + return cpds.getConnection(); - } + } - private static Connection connectImpala() throws SQLException { - /* + private static Connection connectImpala() throws SQLException { + /* * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbImpalaUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbImpalaUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); - cpds.setAcquireRetryAttempts(30); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); + cpds.setAcquireRetryAttempts(30); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); - return cpds.getConnection(); + return cpds.getConnection(); - } + } } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java index 3f958abba..26e44b1f6 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java @@ -3,6 +3,7 @@ * To change this template file, choose Tools | Templates * and open the template in the editor. */ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.text.SimpleDateFormat; @@ -11,162 +12,142 @@ import java.util.Date; import org.apache.commons.io.IOUtils; import org.apache.log4j.BasicConfigurator; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + /** * @author D. Pierrakos, S. Zoupanos */ public class ExecuteWorkflow { - static String matomoAuthToken; - static String matomoBaseURL; - static String repoLogPath; - static String portalLogPath; - static String portalMatomoID; - static String irusUKBaseURL; - static String irusUKReportPath; - static String sarcsReportPathArray; - static String sarcsReportPathNonArray; - static String lareferenciaLogPath; - static String lareferenciaBaseURL; - static String lareferenciaAuthToken; - static String dbHiveUrl; - static String dbImpalaUrl; - static String usageRawDataDBSchema; - static String usageStatsDBSchema; - static String statsDBSchema; - static boolean recreateDbAndTables; +// static String matomoAuthToken; + static String matomoBaseURL; + static String repoLogPath; + static String portalLogPath; + static String portalMatomoID; +// static String irusUKBaseURL; + static String irusUKReportPath; + static String sarcsReportPathArray; + static String sarcsReportPathNonArray; + static String lareferenciaLogPath; +// static String lareferenciaBaseURL; +// static String lareferenciaAuthToken; + static String dbHiveUrl; + static String dbImpalaUrl; + static String usageRawDataDBSchema; + static String usageStatsDBSchema; + static String usagestatsPermanentDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; - static boolean piwikEmptyDirs; - static boolean downloadPiwikLogs; - static boolean processPiwikLogs; + static boolean processPiwikLogs; + static boolean processLaReferenciaLogs; - static Calendar startingLogPeriod; - static Calendar endingLogPeriod; - static int numberOfPiwikIdsToDownload; - static int numberOfSiteIdsToDownload; + static boolean irusProcessStats; - static boolean laReferenciaEmptyDirs; - static boolean downloadLaReferenciaLogs; - static boolean processLaReferenciaLogs; + static boolean sarcProcessStats; - static boolean irusCreateTablesEmptyDirs; - static boolean irusDownloadReports; - static boolean irusProcessStats; - static int irusNumberOfOpendoarsToDownload; + static boolean finalizeStats; + static boolean finalTablesVisibleToImpala; - static boolean sarcCreateTablesEmptyDirs; - static boolean sarcDownloadReports; - static boolean sarcProcessStats; - static int sarcNumberOfIssnToDownload; + static int numberOfDownloadThreads; - static boolean finalizeStats; - static boolean finalTablesVisibleToImpala; + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); - static int numberOfDownloadThreads; + public static void main(String args[]) throws Exception { - private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); + // Sending the logs to the console + BasicConfigurator.configure(); - public static void main(String args[]) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + UsageStatsExporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json"))); + parser.parseArgument(args); - // Sending the logs to the console - BasicConfigurator.configure(); + // Setting up the initial parameters +// matomoAuthToken = parser.get("matomoAuthToken"); +// matomoBaseURL = parser.get("matomoBaseURL"); + repoLogPath = parser.get("repoLogPath"); + portalLogPath = parser.get("portalLogPath"); + portalMatomoID = parser.get("portalMatomoID"); +// irusUKBaseURL = parser.get("irusUKBaseURL"); + irusUKReportPath = parser.get("irusUKReportPath"); + sarcsReportPathArray = parser.get("sarcsReportPathArray"); + sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); + lareferenciaLogPath = parser.get("lareferenciaLogPath"); +// lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); +// lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - UsageStatsExporter.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json"))); - parser.parseArgument(args); + dbHiveUrl = parser.get("dbHiveUrl"); + dbImpalaUrl = parser.get("dbImpalaUrl"); + usageRawDataDBSchema = parser.get("usageRawDataDBSchema"); + usageStatsDBSchema = parser.get("usageStatsDBSchema"); + usagestatsPermanentDBSchema = parser.get("usagestatsPermanentDBSchema"); + statsDBSchema = parser.get("statsDBSchema"); - // Setting up the initial parameters - matomoAuthToken = parser.get("matomoAuthToken"); - matomoBaseURL = parser.get("matomoBaseURL"); - repoLogPath = parser.get("repoLogPath"); - portalLogPath = parser.get("portalLogPath"); - portalMatomoID = parser.get("portalMatomoID"); - irusUKBaseURL = parser.get("irusUKBaseURL"); - irusUKReportPath = parser.get("irusUKReportPath"); - sarcsReportPathArray = parser.get("sarcsReportPathArray"); - sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); - lareferenciaLogPath = parser.get("lareferenciaLogPath"); - lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); - lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { + processPiwikLogs = true; + } else { + processPiwikLogs = false; + } - dbHiveUrl = parser.get("dbHiveUrl"); - dbImpalaUrl = parser.get("dbImpalaUrl"); - usageRawDataDBSchema = parser.get("usageRawDataDBSchema"); - usageStatsDBSchema = parser.get("usageStatsDBSchema"); - statsDBSchema = parser.get("statsDBSchema"); +// String startingLogPeriodStr = parser.get("startingLogPeriod"); +// Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); +// startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); +// +// String endingLogPeriodStr = parser.get("endingLogPeriod"); +// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); +// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); - if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { - processPiwikLogs = true; - } else { - processPiwikLogs = false; - } + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { + recreateDbAndTables = true; + } else { + recreateDbAndTables = false; + } - String startingLogPeriodStr = parser.get("startingLogPeriod"); - Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); - startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { + processLaReferenciaLogs = true; + } else { + processLaReferenciaLogs = false; + } - String endingLogPeriodStr = parser.get("endingLogPeriod"); - Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); - endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); + if (parser.get("irusProcessStats").toLowerCase().equals("true")) { + irusProcessStats = true; + } else { + irusProcessStats = false; + } - numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); - numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { + sarcProcessStats = true; + } else { + sarcProcessStats = false; + } - if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { - recreateDbAndTables = true; - } else { - recreateDbAndTables = false; - } + if (parser.get("finalizeStats").toLowerCase().equals("true")) { + finalizeStats = true; + } else { + finalizeStats = false; + } + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) { + finalTablesVisibleToImpala = true; + } else { + numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); + } - if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { - processLaReferenciaLogs = true; - } else { - processLaReferenciaLogs = false; - } + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); + usagestatsExport.export(); + } - if (parser.get("irusProcessStats").toLowerCase().equals("true")) { - irusProcessStats = true; - } else { - irusProcessStats = false; - } + private static Calendar startingLogPeriodStr(Date date) { - irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); + Calendar calendar = Calendar.getInstance(); + calendar.setTime(date); + return calendar; - if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { - sarcProcessStats = true; - } else { - sarcProcessStats = false; - } - sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); - - if (parser.get("finalizeStats").toLowerCase().equals("true")) { - finalizeStats = true; - } else { - finalizeStats = false; - } - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) { - finalTablesVisibleToImpala = true; - } else { - numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); - } - - UsageStatsExporter usagestatsExport = new UsageStatsExporter(); - usagestatsExport.export(); - } - - private static Calendar startingLogPeriodStr(Date date) { - - Calendar calendar = Calendar.getInstance(); - calendar.setTime(date); - return calendar; - - } + } } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java index 4f34adc04..4439f848e 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.io.*; @@ -27,45 +28,42 @@ import org.slf4j.LoggerFactory; */ public class IrusStats { - private String irusUKURL; + private String irusUKURL; - private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); - public IrusStats() throws Exception { - } + public IrusStats() throws Exception { + } - - public void processIrusStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + logger.info("Creating irus_downloads_stats_tmp table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".irus_downloads_stats_tmp " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created irus_downloads_stats_tmp table"); - logger.info("Creating irus_downloads_stats_tmp table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".irus_downloads_stats_tmp " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created irus_downloads_stats_tmp table"); + logger.info("Inserting into irus_downloads_stats_tmp"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; + stmt.executeUpdate(insertDStats); + logger.info("Inserted into irus_downloads_stats_tmp"); - logger.info("Inserting into irus_downloads_stats_tmp"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_oids ro " - + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; - stmt.executeUpdate(insertDStats); - logger.info("Inserted into irus_downloads_stats_tmp"); + stmt.close(); + // ConnectDB.getHiveConnection().close(); + } - stmt.close(); - //ConnectDB.getHiveConnection().close(); - } - - } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java index ea3ac5948..0d34ebef3 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java @@ -41,8 +41,6 @@ public class LaReferenciaStats { public LaReferenciaStats() throws Exception { } - - public void processLogs() throws Exception { try { logger.info("LaReferencia creating viewsStats"); @@ -62,7 +60,6 @@ public class LaReferenciaStats { } } - public void viewsStats() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); @@ -101,7 +98,7 @@ public class LaReferenciaStats { logger.info("Created la_views_stats_tmp table"); stmt.close(); - ConnectDB.getHiveConnection().close(); + // ConnectDB.getHiveConnection().close(); } private void downloadsStats() throws Exception { @@ -142,8 +139,7 @@ public class LaReferenciaStats { logger.info("Created la_downloads_stats_tmp table"); stmt.close(); - //ConnectDB.getHiveConnection().close(); + // ConnectDB.getHiveConnection().close(); } - } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java index a165c6eab..253dc03b5 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java @@ -1,22 +1,15 @@ package eu.dnetlib.oa.graph.usagestatsbuild.export; -import java.io.*; -import java.net.URLDecoder; import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; +import java.sql.Timestamp; import java.text.SimpleDateFormat; import java.util.*; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.sql.Timestamp; /** * @author D. Pierrakos, S. Zoupanos @@ -29,37 +22,51 @@ public class PiwikStatsDB { private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); - public PiwikStatsDB() throws Exception { } - public void recreateDBAndTables() throws Exception { this.createDatabase(); // The piwiklog table is not needed since it is built // on top of JSON files - ////////////this.createTmpTables(); + //////////// this.createTmpTables(); } private void createDatabase() throws Exception { + +// try { +// +// stmt = ConnectDB.getHiveConnection().createStatement(); +// +// logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); +// String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; +// stmt.executeUpdate(dropDatabase); +// } catch (Exception e) { +// logger.error("Failed to drop database: " + e); +// throw new Exception("Failed to drop database: " + e.toString(), e); +// } +// try { stmt = ConnectDB.getHiveConnection().createStatement(); + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + logger.info("Usagestats DB created: " + ConnectDB.getUsageStatsDBSchema()); - logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); } try { stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); + logger.info("Creating permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema()); + String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS " + + ConnectDB.getUsagestatsPermanentDBSchema(); + stmt.executeUpdate(createPermanentDatabase); + logger.info("Created permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema()); } catch (Exception e) { logger.error("Failed to create database: " + e); @@ -67,17 +74,16 @@ public class PiwikStatsDB { } } - public void processLogs() throws Exception { try { - logger.info("ViewsStats processing starts at: "+new Timestamp(System.currentTimeMillis())); + logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); viewsStats(); - logger.info("ViewsStats processing ends at: "+new Timestamp(System.currentTimeMillis())); + logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis())); - logger.info("DownloadsStats processing starts at: "+new Timestamp(System.currentTimeMillis())); + logger.info("DownloadsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); downloadsStats(); - logger.info("DownloadsStats processing ends at: "+new Timestamp(System.currentTimeMillis())); + logger.info("DownloadsStats processing ends at: " + new Timestamp(System.currentTimeMillis())); } catch (Exception e) { logger.error("Failed to process logs: " + e); @@ -85,68 +91,68 @@ public class PiwikStatsDB { } } - - public void viewsStats() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Dropping openaire_result_views_monthly_tmp view"); - String drop_result_views_monthly = "DROP VIEW IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".openaire_piwikresult_views_monthly_tmp"; + String drop_result_views_monthly = "DROP VIEW IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_piwikresult_views_monthly_tmp"; stmt.executeUpdate(drop_result_views_monthly); logger.info("Dropped openaire_result_views_monthly_tmp view"); logger.info("Creating openaire_result_views_monthly_tmp view"); String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() - + ".openaire_result_views_monthly_tmp " + - "AS SELECT entity_id AS id, " + - "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + - "AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageRawDataDBSchema() - + ".piwiklog where action='action' and (source_item_type='oaItem' or " + - "source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + - "source ORDER BY source, entity_id"; + + ".openaire_result_views_monthly_tmp " + + "AS SELECT entity_id, " + + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id," + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + + "AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + + ".piwiklog where action='action' and (source_item_type='oaItem' or " + + "source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + + "source ORDER BY source, entity_id"; stmt.executeUpdate(create_result_views_monthly); logger.info("Created openaire_result_views_monthly_tmp table"); logger.info("Dropping openaire_views_stats_tmp table"); - String drop_views_stats = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".openaire_views_stats_tmp"; + String drop_views_stats = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp"; stmt.executeUpdate(drop_views_stats); logger.info("Dropped openaire_views_stats_tmp table"); logger.info("Creating openaire_views_stats_tmp table"); String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".openaire_views_stats_tmp " + - "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(views) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.piwik_id AND p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month "; + + ".openaire_views_stats_tmp " + + "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(create_views_stats); logger.info("Created openaire_views_stats_tmp table"); logger.info("Creating openaire_pageviews_stats_tmp table"); String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".openaire_pageviews_stats_tmp AS SELECT " + - "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=" + ExecuteWorkflow.portalMatomoID + " AND p.source=d.piwik_id and p.id=ro.id \n" + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month "; + + ".openaire_pageviews_stats_tmp AS SELECT " + + "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=" + ExecuteWorkflow.portalMatomoID + + " AND p.source=d.piwik_id and p.id=ro.id AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(create_pageviews_stats); logger.info("Created pageviews_stats table"); stmt.close(); - //ConnectDB.getHiveConnection().close(); + // ConnectDB.getHiveConnection().close(); } private void downloadsStats() throws Exception { @@ -154,152 +160,315 @@ public class PiwikStatsDB { ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Dropping openaire_result_downloads_monthly_tmp view"); - String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".openaire_result_downloads_monthly_tmp"; + String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_downloads_monthly_tmp"; stmt.executeUpdate(drop_result_downloads_monthly); logger.info("Dropped openaire_result_downloads_monthly_tmp view"); logger.info("Creating openaire_result_downloads_monthly_tmp view"); - String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp " + - "AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " + - "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageRawDataDBSchema()+ ".piwiklog where action='download' " + - "AND (source_item_type='oaItem' OR source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + - "ORDER BY source, entity_id, month"; + String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_downloads_monthly_tmp " + + "AS SELECT entity_id, " + + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id," + + "COUNT(entity_id) as downloads, " + + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog where action='download' " + + "AND (source_item_type='oaItem' OR source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + + "ORDER BY source, entity_id, month"; stmt.executeUpdate(sql); logger.info("Created openaire_result_downloads_monthly_tmp view"); logger.info("Dropping openaire_downloads_stats_tmp table"); - String drop_views_stats = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".openaire_downloads_stats_tmp"; + String drop_views_stats = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp"; stmt.executeUpdate(drop_views_stats); logger.info("Dropped openaire_downloads_stats_tmp table"); logger.info("Creating openaire_downloads_stats_tmp table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " + - "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(downloads) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.piwik_id and p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month "; + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(sql); logger.info("Created downloads_stats table"); - logger.info("Dropping openaire_result_downloads_monthly_tmp view"); - sql = "DROP VIEW IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp"; + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp"; logger.info("Dropped openaire_result_downloads_monthly_tmp view "); stmt.executeUpdate(sql); stmt.close(); - //ConnectDB.getHiveConnection().close(); + // ConnectDB.getHiveConnection().close(); + } + + public void uploadOldPedocs() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping Pedocs pedocs_views_stats_tmp table + logger.info("Dropping Pedocs pedocs_views_stats_tmp table"); + String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp"; + logger.info("Dropped pedocs_views_stats_tmp table "); + stmt.executeUpdate(sql); + + // Dropping Pedocs pedocs_downloads_stats table + logger.info("Dropping pedocs_downloads_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats"; + logger.info("Dropped pedocs_downloads_stats table "); + stmt.executeUpdate(sql); + + // Creating Pedocs pedocs_views_stats_tmp table + logger.info("Creating Pedocs pedocs_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id," + + "r.id as result_id,date,counter_abstract as count, 0 as openaire " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsoldviews p, " + ConnectDB.getStatsDBSchema() + + ".result_oids r where r.oid=p.identifier"; + stmt.executeUpdate(sql); + logger.info("Created pedocs_views_stats_tmp table "); + + // Creating Pedocs pedocs_downloads_stats_tmp table + logger.info("Creating Pedocs pedocs_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id," + + "r.id as result_id, date, counter as count, 0 as openaire " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsolddownloads p, " + ConnectDB.getStatsDBSchema() + + ".result_oids r where r.oid=p.identifier"; + stmt.executeUpdate(sql); + logger.info("Created pedocs_downloads_stats_tmp table "); + + } + + public void uploadTUDELFTStats() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view"); + String sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp"; + logger.info("Dropped tudelft_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp"; + logger.info("Dropped tudelft_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_views_stats_tmp table + logger.info("Dropping TUDELFT tudelft_views_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp"; + logger.info("Dropped tudelft_views_stats_tmp table "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_downloads_stats_tmp table + logger.info("Dropping TUDELFT tudelft_downloads_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp"; + logger.info("Dropped tudelft_downloads_stats_tmp table "); + stmt.executeUpdate(sql); + + // Creating TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Creating TUDELFT tudelft_result_views_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='action' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created tudelft_result_views_monthly_tmp view "); + + // Creating TUDELFT tudelft_views_stats_tmp table + logger.info("Creating TUDELFT tudelft_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".tudelft_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created TUDELFT tudelft_views_stats_tmp table"); + + // Creating TUDELFT tudelft_result_downloads_monthly_tmp view + logger.info("Creating TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='download' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created tudelft_result_downloads_monthly_tmp view "); + + // Creating TUDELFT tudelft_downloads_stats_tmp table + logger.info("Creating TUDELFT tudelft_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".tudelft_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created TUDELFT tudelft_downloads_stats_tmp table"); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp"; + logger.info("Dropped tudelft_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp"; + logger.info("Dropped tudelft_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + } public void finalizeStats() throws Exception { stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); - //Dropping views_stats table - logger.info("Dropping views_stats table"); - String sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + // Dropping views_stats table + logger.info("Dropping views_stats table"); + String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; logger.info("Dropped views_stats table "); stmt.executeUpdate(sql); - //Dropping downloads_stats table - logger.info("Dropping downloads_stats table"); - sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + // Dropping downloads_stats table + logger.info("Dropping downloads_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; logger.info("Dropped downloads_stats table "); stmt.executeUpdate(sql); - //Dropping page_views_stats table - logger.info("Dropping pageviews_stats table"); - sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + // Dropping page_views_stats table + logger.info("Dropping pageviews_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; logger.info("Dropped pageviews_stats table "); stmt.executeUpdate(sql); - //Creating views_stats table + // Dropping usage_stats table + logger.info("Dropping usage_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + logger.info("Dropped usage_stats table "); + stmt.executeUpdate(sql); + + // Creating views_stats table logger.info("Creating views_stats table"); - String createViewsStats = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".views_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET"; + String createViewsStats = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".views_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET"; stmt.executeUpdate(createViewsStats); logger.info("Created views_stats table"); - - //Inserting OpenAIRE views stats - logger.info("Inserting Openaire data to views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp"; - stmt.executeUpdate(sql); - logger.info("Openaire views updated to views_stats"); - //Inserting Lareferencia views stats - logger.info("Inserting LaReferencia data to views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; + // Inserting OpenAIRE views stats + logger.info("Inserting Openaire data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp"; stmt.executeUpdate(sql); - logger.info("LaReferencia views updated to views_stats"); - + logger.info("Openaire views updated to views_stats"); + + // Inserting Pedocs old views stats + logger.info("Inserting Pedocs old data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Pedocs views updated to views_stats"); + + // Inserting TUDELFT views stats + logger.info("Inserting TUDELFT data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("TUDELFT views updated to views_stats"); + + // Inserting Lareferencia views stats + logger.info("Inserting LaReferencia data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("LaReferencia views updated to views_stats"); logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".downloads_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET"; + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET"; stmt.executeUpdate(createDownloadsStats); logger.info("Created downloads_stats table"); - //Inserting OpenAIRE downloads stats + // Inserting OpenAIRE downloads stats logger.info("Inserting OpenAIRE data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp"; + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp"; stmt.executeUpdate(sql); + logger.info("Inserted OpenAIRE data to downloads_stats"); - //Inserting Lareferencia downloads stats + // Inserting Pedocs old downloads stats + logger.info("Inserting PeDocs old data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Inserted Pedocs data to downloads_stats"); + + // Inserting TUDELFT downloads stats + logger.info("Inserting TUDELFT old data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Inserted TUDELFT data to downloads_stats"); + + // Inserting Lareferencia downloads stats logger.info("Inserting LaReferencia data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; stmt.executeUpdate(sql); - logger.info("Lareferencia downloads updated to downloads_stats"); - - //Inserting IRUS downloads stats - logger.info("Inserting IRUS data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp"; - stmt.executeUpdate(sql); - logger.info("IRUS downloads updated to downloads_stats"); + logger.info("Lareferencia downloads updated to downloads_stats"); - //Inserting SARC-OJS downloads stats - logger.info("Inserting SARC data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp"; + // Inserting IRUS downloads stats + logger.info("Inserting IRUS data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp"; stmt.executeUpdate(sql); - logger.info("SARC-OJS downloads updated to downloads_stats"); - - - logger.info("Creating pageviews_stats table"); + logger.info("IRUS downloads updated to downloads_stats"); + + // Inserting SARC-OJS downloads stats + logger.info("Inserting SARC data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("SARC-OJS downloads updated to downloads_stats"); + + logger.info("Creating pageviews_stats table"); String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".pageviews_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET"; + + ".pageviews_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET"; stmt.executeUpdate(create_pageviews_stats); logger.info("Created pageviews_stats table"); - - //Inserting OpenAIRE views stats from Portal + + // Inserting OpenAIRE views stats from Portal logger.info("Inserting data to page_views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp"; + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp"; stmt.executeUpdate(sql); - + logger.info("Dropping full_dates table"); - String dropFullDates = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".full_dates"; + String dropFullDates = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".full_dates"; stmt.executeUpdate(dropFullDates); logger.info("Dropped full_dates table"); @@ -310,35 +479,80 @@ public class PiwikStatsDB { int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH); logger.info("Creating full_dates table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " + - "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " + - "FROM (SELECT DATE '2016-01-01' AS from_date) p " + - "LATERAL VIEW " + - "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x"; + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " + + "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " + + "FROM (SELECT DATE '2016-01-01' AS from_date) p " + + "LATERAL VIEW " + + "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x"; stmt.executeUpdate(sql); logger.info("Created full_dates table"); - - logger.info("Inserting data to usage_stats"); - sql = "CREATE TABLE IF NOT EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " + - "SELECT coalesce(ds.source, vs.source) as source, " + - "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + - "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + - "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + - "coalesce(ds.openaire, 0) as openaire_downloads, " + - "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + - "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(sql); - logger.info("Inserted data to usage_stats"); + logger.info("Inserting data to usage_stats"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " + + "SELECT coalesce(ds.source, vs.source) as source, " + + "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + + "coalesce(ds.openaire, 0) as openaire_downloads, " + + "coalesce(vs.openaire, 0) as openaire_views " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + + ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; + stmt.executeUpdate(sql); + logger.info("Inserted data to usage_stats"); + logger.info("Building views at permanent DB starts at: " + new Timestamp(System.currentTimeMillis())); + + logger.info("Dropping view views_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view views_stats on permanent usagestats DB"); + + logger.info("Create view views_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + logger.info("Created view views_stats on permanent usagestats DB"); + + logger.info("Dropping view pageviews_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view pageviews_stats on permanent usagestats DB"); + + logger.info("Create view pageviews_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + logger.info("Created view pageviews_stats on permanent usagestats DB"); + + logger.info("Dropping view downloads_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view on downloads_stats on permanent usagestats DB"); + + logger.info("Create view on downloads_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + logger.info("Created view on downloads_stats on permanent usagestats DB"); + + logger.info("Dropping view usage_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view on usage_stats on permanent usagestats DB"); + + logger.info("Create view on usage_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + logger.info("Created view on usage_stats on permanent usagestats DB"); + + logger.info("Building views at permanent DB ends at: " + new Timestamp(System.currentTimeMillis())); stmt.close(); ConnectDB.getHiveConnection().close(); } - private Connection getConnection() throws SQLException { return ConnectDB.getHiveConnection(); } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java index 2d224075f..880233f00 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.io.*; @@ -33,74 +34,74 @@ import org.slf4j.LoggerFactory; */ public class SarcStats { - private Statement stmtHive = null; - private Statement stmtImpala = null; + private Statement stmtHive = null; + private Statement stmtImpala = null; - private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); - public SarcStats() throws Exception { + public SarcStats() throws Exception { // createTables(); - } + } - private void createTables() throws Exception { - try { + private void createTables() throws Exception { + try { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; - stmtHive.executeUpdate(sqlCreateTableSushiLog); + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); - // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; - // stmt.executeUpdate(sqlCopyPublicSushiLog); - String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " - + " ON INSERT TO sushilog " - + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," - + "sushilog.rid, sushilog.date " - + "FROM sushilog " - + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; - stmtHive.executeUpdate(sqlcreateRuleSushiLog); - String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; - stmtHive.executeUpdate(createSushiIndex); + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } - public void processSarc() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void processSarc() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Creating sarc_downloads_stats_tmp table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_downloads_stats_tmp " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created sarc_downloads_stats_tmp table"); + logger.info("Creating sarc_downloads_stats_tmp table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_downloads_stats_tmp " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created sarc_downloads_stats_tmp table"); - logger.info("Inserting into sarc_downloads_stats_tmp"); - String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " - + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_pids ro " - + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " - + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; - stmt.executeUpdate(insertSarcStats); - logger.info("Inserted into sarc_downloads_stats_tmp"); - - stmt.close(); - //ConnectDB.getHiveConnection().close(); - } + logger.info("Inserting into sarc_downloads_stats_tmp"); + String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " + + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_pids ro " + + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " + + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; + stmt.executeUpdate(insertSarcStats); + logger.info("Inserted into sarc_downloads_stats_tmp"); + + stmt.close(); + // ConnectDB.getHiveConnection().close(); + } } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java index 43abb1681..47986f52a 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.io.IOException; @@ -17,90 +18,110 @@ import org.slf4j.LoggerFactory; */ public class UsageStatsExporter { - public UsageStatsExporter() { + public UsageStatsExporter() { - } + } - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - public void export() throws Exception { + public void export() throws Exception { - logger.info("Initialising DB properties"); - ConnectDB.init(); + logger.info("Initialising DB properties"); + ConnectDB.init(); // runImpalaQuery(); - PiwikStatsDB piwikstatsdb = new PiwikStatsDB(); + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(); - logger.info("Re-creating database and tables"); - if (ExecuteWorkflow.recreateDbAndTables) { - piwikstatsdb.recreateDBAndTables(); - logger.info("DB-Tables are created "); - } + logger.info("Re-creating database and tables"); + if (ExecuteWorkflow.recreateDbAndTables) { + piwikstatsdb.recreateDBAndTables(); + logger.info("DB-Tables are created "); + } // else { // piwikstatsdb.createTmpTables(); // logger.info("TmpTables are created "); // } - if (ExecuteWorkflow.processPiwikLogs) { - logger.info("Processing logs"); - piwikstatsdb.processLogs(); - } + if (ExecuteWorkflow.processPiwikLogs) { + logger.info("Processing Piwik logs"); + piwikstatsdb.processLogs(); + logger.info("Piwik logs Done"); + logger.info("Processing Pedocs Old Stats"); + piwikstatsdb.uploadOldPedocs(); + logger.info("Processing Pedocs Old Stats Done"); + logger.info("Processing TUDELFT Stats"); + piwikstatsdb.uploadTUDELFTStats(); + logger.info("Processing TUDELFT Stats Done"); - LaReferenciaStats lastats = new LaReferenciaStats(); + } - if (ExecuteWorkflow.processLaReferenciaLogs) { - logger.info("Processing LaReferencia logs"); - lastats.processLogs(); - logger.info("LaReferencia logs done"); - } - - IrusStats irusstats = new IrusStats(); - - if (ExecuteWorkflow.irusProcessStats) { - logger.info("Processing IRUS"); - irusstats.processIrusStats(); - logger.info("Irus done"); - } + LaReferenciaStats lastats = new LaReferenciaStats(); - SarcStats sarcStats = new SarcStats(); + if (ExecuteWorkflow.processLaReferenciaLogs) { + logger.info("Processing LaReferencia logs"); + lastats.processLogs(); + logger.info("LaReferencia logs done"); + } - if (ExecuteWorkflow.sarcProcessStats) { - sarcStats.processSarc(); - } - logger.info("Sarc done"); + IrusStats irusstats = new IrusStats(); - // finalize usagestats - if (ExecuteWorkflow.finalizeStats) { - piwikstatsdb.finalizeStats(); - logger.info("Finalized stats"); - } + if (ExecuteWorkflow.irusProcessStats) { + logger.info("Processing IRUS"); + irusstats.processIrusStats(); + logger.info("Irus done"); + } - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } + SarcStats sarcStats = new SarcStats(); - logger.info("End"); - } + if (ExecuteWorkflow.sarcProcessStats) { + sarcStats.processSarc(); + } + logger.info("Sarc done"); - private void invalidateMetadata() throws SQLException { - Statement stmt = null; + // finalize usagestats + if (ExecuteWorkflow.finalizeStats) { + piwikstatsdb.finalizeStats(); + logger.info("Finalized stats"); + } - stmt = ConnectDB.getImpalaConnection().createStatement(); + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } - String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; - stmt.executeUpdate(sql); + logger.info("End"); + } - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; - stmt.executeUpdate(sql); + private void invalidateMetadata() throws SQLException { + Statement stmt = null; - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; - stmt.executeUpdate(sql); + stmt = ConnectDB.getImpalaConnection().createStatement(); - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; - stmt.executeUpdate(sql); + String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); - stmt.close(); - ConnectDB.getHiveConnection().close(); - } + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json index 3f121288e..407370ada 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json @@ -1,237 +1,128 @@ [ - { - "paramName": "mat", - "paramLongName": "matomoAuthToken", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false - }, - { - "paramName": "mbu", - "paramLongName": "matomoBaseURL", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, - { - "paramName": "rlp", - "paramLongName": "repoLogPath", - "paramDescription": "nameNode of the source cluster", - "paramRequired": true - }, - { - "paramName": "plp", - "paramLongName": "portalLogPath", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "pmi", - "paramLongName": "portalMatomoID", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "iukbuw", - "paramLongName": "irusUKBaseURL", - "paramDescription": "working directory", - "paramRequired": true - }, - { - "paramName": "iukrp", - "paramLongName": "irusUKReportPath", - "paramDescription": "maximum number of map tasks used in the distcp process", - "paramRequired": true - }, - { - "paramName": "srpa", - "paramLongName": "sarcsReportPathArray", - "paramDescription": "memory for distcp action copying actionsets from remote cluster", - "paramRequired": true - }, - { - "paramName": "srpna", - "paramLongName": "sarcsReportPathNonArray", - "paramDescription": "timeout for distcp copying actions from remote cluster", - "paramRequired": true - }, - { - "paramName": "llp", - "paramLongName": "lareferenciaLogPath", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "lbu", - "paramLongName": "lareferenciaBaseURL", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "lat", - "paramLongName": "lareferenciaAuthToken", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "urdbs", - "paramLongName": "usageRawDataDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, { - "paramName": "usdbs", - "paramLongName": "usageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "piwikEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ppwl", - "paramLongName": "processPiwikLogs", - "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", - "paramRequired": true - }, - { - "paramName": "dpwl", - "paramLongName": "downloadPiwikLogs", - "paramDescription": "download piwik logs?", - "paramRequired": true - }, - { - "paramName": "slp", - "paramLongName": "startingLogPeriod", - "paramDescription": "Starting log period", - "paramRequired": true - }, - { - "paramName": "elp", - "paramLongName": "endingLogPeriod", - "paramDescription": "Ending log period", - "paramRequired": true - }, - { - "paramName": "npidd", - "paramLongName": "numberOfPiwikIdsToDownload", - "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload", - "paramRequired": true - }, - { - "paramName": "nsidd", - "paramLongName": "numberOfSiteIdsToDownload", - "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload", - "paramRequired": true - }, - { - "paramName": "lerd", - "paramLongName": "laReferenciaEmptyDirs", - "paramDescription": "Empty LaReferencia directories?", - "paramRequired": true - }, - { - "paramName": "plrl", - "paramLongName": "processLaReferenciaLogs", - "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", - "paramRequired": true - }, - { - "paramName": "dlrl", - "paramLongName": "downloadLaReferenciaLogs", - "paramDescription": "download La Referencia logs?", - "paramRequired": true - }, - { - "paramName": "icted", - "paramLongName": "irusCreateTablesEmptyDirs", - "paramDescription": "Irus section: Create tables and empty JSON directories?", - "paramRequired": true - }, - { - "paramName": "idr", - "paramLongName": "irusDownloadReports", - "paramDescription": "Irus section: Download reports?", - "paramRequired": true - }, - { - "paramName": "ipr", - "paramLongName": "irusProcessStats", - "paramDescription": "Irus section: Process stats?", - "paramRequired": true - }, - { - "paramName": "inod", - "paramLongName": "irusNumberOfOpendoarsToDownload", - "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", - "paramRequired": true - }, - { - "paramName": "icted", - "paramLongName": "sarcCreateTablesEmptyDirs", - "paramDescription": "Sarc section: Create tables and empty JSON directories?", - "paramRequired": true - }, - { - "paramName": "idr", - "paramLongName": "sarcDownloadReports", - "paramDescription": "Sarc section: Download reports?", - "paramRequired": true - }, - { - "paramName": "ipr", - "paramLongName": "sarcProcessStats", - "paramDescription": "Sarc section: Process stats?", - "paramRequired": true - }, - { - "paramName": "inod", - "paramLongName": "sarcNumberOfIssnToDownload", - "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", - "paramRequired": true - }, - - { - "paramName": "fs", - "paramLongName": "finalizeStats", - "paramDescription": "Create the usage_stats table?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", - "paramRequired": true - }, - { - "paramName": "nodt", - "paramLongName": "numberOfDownloadThreads", - "paramDescription": "Number of download threads", - "paramRequired": true - } + "paramName": "rlp", + "paramLongName": "repoLogPath", + "paramDescription": "nameNode of the source cluster", + "paramRequired": true + }, + { + "paramName": "plp", + "paramLongName": "portalLogPath", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "pmi", + "paramLongName": "portalMatomoID", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "iukrp", + "paramLongName": "irusUKReportPath", + "paramDescription": "maximum number of map tasks used in the distcp process", + "paramRequired": true + }, + { + "paramName": "srpa", + "paramLongName": "sarcsReportPathArray", + "paramDescription": "memory for distcp action copying actionsets from remote cluster", + "paramRequired": true + }, + { + "paramName": "srpna", + "paramLongName": "sarcsReportPathNonArray", + "paramDescription": "timeout for distcp copying actions from remote cluster", + "paramRequired": true + }, + { + "paramName": "llp", + "paramLongName": "lareferenciaLogPath", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "urdbs", + "paramLongName": "usageRawDataDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "usdbs", + "paramLongName": "usageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "uspdbs", + "paramLongName": "usagestatsPermanentDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "ppwl", + "paramLongName": "processPiwikLogs", + "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "plrl", + "paramLongName": "processLaReferenciaLogs", + "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "irusProcessStats", + "paramDescription": "Irus section: Process stats?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "sarcProcessStats", + "paramDescription": "Sarc section: Process stats?", + "paramRequired": true + }, + { + "paramName": "fs", + "paramLongName": "finalizeStats", + "paramDescription": "Create the usage_stats table?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", + "paramRequired": true + }, + { + "paramName": "nodt", + "paramLongName": "numberOfDownloadThreads", + "paramDescription": "Number of download threads", + "paramRequired": true + } ] diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml index 37700539b..71e8a50d6 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml @@ -42,42 +42,24 @@ eu.dnetlib.oa.graph.usagestatsbuild.export.ExecuteWorkflow - --matomoAuthToken${matomoAuthToken} - --matomoBaseURL${matomoBaseURL} --repoLogPath${repoLogPath} --portalLogPath${portalLogPath} --portalMatomoID${portalMatomoID} - --irusUKBaseURL${irusUKBaseURL} --irusUKReportPath${irusUKReportPath} --sarcsReportPathArray${sarcsReportPathArray} --sarcsReportPathNonArray${sarcsReportPathNonArray} --lareferenciaLogPath${lareferenciaLogPath} - --lareferenciaBaseURL${lareferenciaBaseURL} - --lareferenciaAuthToken${lareferenciaAuthToken} --dbHiveUrl${hiveJdbcUrl} --dbImpalaUrl${impalaJdbcUrl} --usageRawDataDBSchema${usageRawDataDBSchema} --usageStatsDBSchema${usageStatsDBSchema} + --usagestatsPermanentDBSchema${usagestatsPermanentDBSchema} --statsDBSchema${statsDBSchema} --recreateDbAndTables${recreateDbAndTables} - --piwikEmptyDirs${piwikEmptyDirs} - --downloadPiwikLogs${downloadPiwikLogs} --processPiwikLogs${processPiwikLogs} - --startingLogPeriod${startingLogPeriod} - --endingLogPeriod${endingLogPeriod} - --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} - --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} - --laReferenciaEmptyDirs${laReferenciaEmptyDirs} - --downloadLaReferenciaLogs${downloadLaReferenciaLogs} --processLaReferenciaLogs${processLaReferenciaLogs} - --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} - --irusDownloadReports${irusDownloadReports} --irusProcessStats${irusProcessStats} - --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload} - --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs} - --sarcDownloadReports${sarcDownloadReports} --sarcProcessStats${sarcProcessStats} - --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} --finalizeStats${finalizeStats} --finalTablesVisibleToImpala${finalTablesVisibleToImpala} --numberOfDownloadThreads${numberOfDownloadThreads}