From b8a3392b5908ebe3e6e07da8232cd8fd79718688 Mon Sep 17 00:00:00 2001 From: Dimitris Date: Fri, 30 Oct 2020 14:07:21 +0200 Subject: [PATCH 01/16] Commit 30102020 --- dhp-workflows/dhp-usage-stats-update/pom.xml | 78 + .../oa/graph/usagestats/export/ConnectDB.java | 125 ++ .../usagestats/export/ExecuteWorkflow.java | 197 +++ .../oa/graph/usagestats/export/IrusStats.java | 417 ++++++ .../export/LaReferenciaDownloadLogs.java | 261 ++++ .../usagestats/export/LaReferenciaStats.java | 423 ++++++ .../usagestats/export/PiwikDownloadLogs.java | 316 +++++ .../graph/usagestats/export/PiwikStatsDB.java | 1255 +++++++++++++++++ .../export/ReadCounterRobotsList.java | 54 + .../oa/graph/usagestats/export/SarcStats.java | 571 ++++++++ .../usagestats/export/UsageStatsExporter.java | 179 +++ .../export/usagestats_parameters.json | 231 +++ .../usagestats/oozie_app/config-default.xml | 38 + .../graph/usagestats/oozie_app/workflow.xml | 90 ++ 14 files changed, 4235 insertions(+) create mode 100644 dhp-workflows/dhp-usage-stats-update/pom.xml create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-usage-stats-update/pom.xml b/dhp-workflows/dhp-usage-stats-update/pom.xml new file mode 100644 index 000000000..b56257ee5 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/pom.xml @@ -0,0 +1,78 @@ + + + + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + + 4.0.0 + dhp-usage-stats-update + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java new file mode 100644 index 000000000..ffc7c74cd --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java @@ -0,0 +1,125 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.Properties; + +import org.apache.log4j.Logger; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +/** + * @author D. Pierrakos, S. Zoupanos + */ +import com.mchange.v2.c3p0.ComboPooledDataSource; + +public abstract class ConnectDB { + + public static Connection DB_HIVE_CONNECTION; + public static Connection DB_IMPALA_CONNECTION; + + private static String dbHiveUrl; + private static String dbImpalaUrl; + private static String usageStatsDBSchema; + private static String statsDBSchema; + private final static Logger log = Logger.getLogger(ConnectDB.class); + + static void init() throws ClassNotFoundException { + + dbHiveUrl = ExecuteWorkflow.dbHiveUrl; + dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; + usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; + statsDBSchema = ExecuteWorkflow.statsDBSchema; + + Class.forName("org.apache.hive.jdbc.HiveDriver"); + } + + public static Connection getHiveConnection() throws SQLException { + if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { + return DB_HIVE_CONNECTION; + } else { + DB_HIVE_CONNECTION = connectHive(); + + return DB_HIVE_CONNECTION; + } + } + + public static Connection getImpalaConnection() throws SQLException { + if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { + return DB_IMPALA_CONNECTION; + } else { + DB_IMPALA_CONNECTION = connectImpala(); + + return DB_IMPALA_CONNECTION; + } + } + + public static String getUsageStatsDBSchema() { + return ConnectDB.usageStatsDBSchema; + } + + public static String getStatsDBSchema() { + return ConnectDB.statsDBSchema; + } + + private static Connection connectHive() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbHiveUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(30000); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + return cpds.getConnection(); + + } + + private static Connection connectImpala() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbImpalaUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(30000); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + + return cpds.getConnection(); + + } + +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java new file mode 100644 index 000000000..50b951cbc --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java @@ -0,0 +1,197 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.BasicConfigurator; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class ExecuteWorkflow { + + static String matomoAuthToken; + static String matomoBaseURL; + static String repoLogPath; + static String portalLogPath; + static String portalMatomoID; + static String irusUKBaseURL; + static String irusUKReportPath; + static String sarcsReportPathArray; + static String sarcsReportPathNonArray; + static String lareferenciaLogPath; + static String lareferenciaBaseURL; + static String lareferenciaAuthToken; + static String dbHiveUrl; + static String dbImpalaUrl; + static String usageStatsDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; + + static boolean piwikEmptyDirs; + static boolean downloadPiwikLogs; + static boolean processPiwikLogs; + + static Calendar startingLogPeriod; + static Calendar endingLogPeriod; + static int numberOfPiwikIdsToDownload; + static int numberOfSiteIdsToDownload; + + static boolean laReferenciaEmptyDirs; + static boolean downloadLaReferenciaLogs; + static boolean processLaReferenciaLogs; + + static boolean irusCreateTablesEmptyDirs; + static boolean irusDownloadReports; + static boolean irusProcessStats; + static int irusNumberOfOpendoarsToDownload; + + static boolean sarcCreateTablesEmptyDirs; + static boolean sarcDownloadReports; + static boolean sarcProcessStats; + static int sarcNumberOfIssnToDownload; + + static boolean finalizeStats; + static boolean finalTablesVisibleToImpala; + + static int numberOfDownloadThreads; + + public static void main(String args[]) throws Exception { + + // Sending the logs to the console + BasicConfigurator.configure(); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + UsageStatsExporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json"))); + parser.parseArgument(args); + + // Setting up the initial parameters + matomoAuthToken = parser.get("matomoAuthToken"); + matomoBaseURL = parser.get("matomoBaseURL"); + repoLogPath = parser.get("repoLogPath"); + portalLogPath = parser.get("portalLogPath"); + portalMatomoID = parser.get("portalMatomoID"); + irusUKBaseURL = parser.get("irusUKBaseURL"); + irusUKReportPath = parser.get("irusUKReportPath"); + sarcsReportPathArray = parser.get("sarcsReportPathArray"); + sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); + lareferenciaLogPath = parser.get("lareferenciaLogPath"); + lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); + lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); + + dbHiveUrl = parser.get("dbHiveUrl"); + dbImpalaUrl = parser.get("dbImpalaUrl"); + usageStatsDBSchema = parser.get("usageStatsDBSchema"); + statsDBSchema = parser.get("statsDBSchema"); + + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) + recreateDbAndTables = true; + else + recreateDbAndTables = false; + + if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) + piwikEmptyDirs = true; + else + piwikEmptyDirs = false; + + if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) + downloadPiwikLogs = true; + else + downloadPiwikLogs = false; + + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) + processPiwikLogs = true; + else + processPiwikLogs = false; + + String startingLogPeriodStr = parser.get("startingLogPeriod"); + Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); + startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); + + String endingLogPeriodStr = parser.get("endingLogPeriod"); + Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); + endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); + + numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); + numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); + + if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) + laReferenciaEmptyDirs = true; + else + laReferenciaEmptyDirs = false; + + if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) + downloadLaReferenciaLogs = true; + else + downloadLaReferenciaLogs = false; + + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) + processLaReferenciaLogs = true; + else + processLaReferenciaLogs = false; + + if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) + irusCreateTablesEmptyDirs = true; + else + irusCreateTablesEmptyDirs = false; + if (parser.get("irusDownloadReports").toLowerCase().equals("true")) + irusDownloadReports = true; + else + irusDownloadReports = false; + if (parser.get("irusProcessStats").toLowerCase().equals("true")) + irusProcessStats = true; + else + irusProcessStats = false; + irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); + + if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) + sarcCreateTablesEmptyDirs = true; + else + sarcCreateTablesEmptyDirs = false; + if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) + sarcDownloadReports = true; + else + sarcDownloadReports = false; + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) + sarcProcessStats = true; + else + sarcProcessStats = false; + sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); + + if (parser.get("finalizeStats").toLowerCase().equals("true")) + finalizeStats = true; + else + finalizeStats = false; + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) + finalTablesVisibleToImpala = true; + else + finalTablesVisibleToImpala = false; + + numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); + + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); + usagestatsExport.export(); + } + + private static Calendar startingLogPeriodStr(Date date) { + + Calendar calendar = Calendar.getInstance(); + calendar.setTime(date); + return calendar; + + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java new file mode 100644 index 000000000..749687ec5 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java @@ -0,0 +1,417 @@ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class IrusStats { + + private String irusUKURL; + + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + + public IrusStats(String irusUKURL) throws Exception { + this.irusUKURL = irusUKURL; + // The following may not be needed - It will be created when JSON tables are created +// createTmpTables(); + } + + public void reCreateLogDirs() throws Exception { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); + + logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); + } + + public void createTables() throws Exception { + try { + logger.info("Creating sushilog"); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); + + // To see how to apply to the ignore duplicate rules and indexes +// stmt.executeUpdate(sqlCreateTableSushiLog); +// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO sushilog " +// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," +// + "sushilog.rid, sushilog.date " +// + "FROM sushilog " +// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlcreateRuleSushiLog); +// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; +// stmt.executeUpdate(createSushiIndex); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + +// // The following may not be needed - It will be created when JSON tables are created +// private void createTmpTables() throws Exception { +// try { +// +// Statement stmt = ConnectDB.getConnection().createStatement(); +// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; +// stmt.executeUpdate(sqlCreateTableSushiLog); +// +// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;"); +// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;"; +// // stmt.executeUpdate(sqlCopyPublicSushiLog); +// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO sushilogtmp " +// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository," +// + "sushilogtmp.rid, sushilogtmp.date " +// + "FROM sushilogtmp " +// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlcreateRuleSushiLog); +// +// stmt.close(); +// ConnectDB.getConnection().close(); +// log.info("Sushi Tmp Tables Created"); +// } catch (Exception e) { +// log.error("Failed to create tables: " + e); +// throw new Exception("Failed to create tables: " + e.toString(), e); +// } +// } + + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping sushilogtmp_json table"); + String dropSushilogtmpJson = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sushilogtmp_json"; + stmt.executeUpdate(dropSushilogtmpJson); + logger.info("Dropped sushilogtmp_json table"); + + logger.info("Creating irus_sushilogtmp_json table"); + String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " Type: STRING,\n" + + " Value: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` ARRAY<\n" + + " struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(createSushilogtmpJson); + logger.info("Created irus_sushilogtmp_json table"); + + logger.info("Dropping irus_sushilogtmp table"); + String dropSushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(dropSushilogtmp); + logger.info("Dropped irus_sushilogtmp table"); + + logger.info("Creating irus_sushilogtmp table"); + String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(createSushilogtmp); + logger.info("Created irus_sushilogtmp table"); + + logger.info("Inserting to irus_sushilogtmp table"); + String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " + + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " + + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + + "WHERE `ItemIdent`.`Type`= 'OAI'"; + stmt.executeUpdate(insertSushilogtmp); + logger.info("Inserted to irus_sushilogtmp table"); + + logger.info("Creating downloads_stats table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created downloads_stats table"); + + logger.info("Inserting into downloads_stats"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; + stmt.executeUpdate(insertDStats); + logger.info("Inserted into downloads_stats"); + + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository_id` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); + + logger.info("Inserting to sushilog table"); + String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(insertToShushilog); + logger.info("Inserted to sushilog table"); + + ConnectDB.getHiveConnection().close(); + } + + public void getIrusRRReport(String irusUKReportPath) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); + + String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + + logger.info("(getIrusRRReport) Getting report: " + reportUrl); + + String text = getJson(reportUrl, "", ""); + + List opendoarsToVisit = new ArrayList(); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + int i = 0; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject opendoar = (JSONObject) identifier; + if (opendoar.get("Type").toString().equals("OpenDOAR")) { + i++; + opendoarsToVisit.add(opendoar.get("Value").toString()); + break; + } + } + // break; + } + + logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); + + if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 && + ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + } + + logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); + + for (String opendoar : opendoarsToVisit) { + logger.info("Now working on openDoar: " + opendoar); + this.getIrusIRReport(opendoar, irusUKReportPath); + } + + logger.info("(getIrusRRReport) Finished with report: " + reportUrl); + } + + private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { + + logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); + + ConnectDB.getHiveConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, "opendoar____::" + opendoar); + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + } + } + rs_date.close(); + int batch_size = 0; + + while (start.before(end)) { + // log.info("date: " + simpleDateFormat.format(start.getTime())); + String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) + + "&RepositoryIdentifier=opendoar%3A" + opendoar + + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + start.add(Calendar.MONTH, 1); + + logger.info("Downloading file: " + reportUrl); + String text = getJson(reportUrl, "", ""); + if (text == null) { + continue; + } + + FileSystem fs = FileSystem.get(new Configuration()); + String filePath = irusUKReportPath + "/" + "IrusIRReport_" + + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePath); + FSDataOutputStream fin = fs.create(new Path(filePath), true); + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray == null) { + continue; + } + String oai = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + fin.write(jsonObjectRow.toJSONString().getBytes()); + fin.writeChar('\n'); + } + + fin.close(); + } + + ConnectDB.getHiveConnection().close(); + + logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); + } + + private String getJson(String url) throws Exception { + try { + System.out.println("===> Connecting to: " + url); + URL website = new URL(url); + System.out.println("Connection url -----> " + url); + URLConnection connection = website.openConnection(); + + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); +// response.append("\n"); + } + } + + System.out.println("response ====> " + response.toString()); + + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } + + private String getJson(String url, String username, String password) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL", e); + return null; + } + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java new file mode 100644 index 000000000..0e0e013cf --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java @@ -0,0 +1,261 @@ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class LaReferenciaDownloadLogs { + + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; + + /* + * The Piwik's API method + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; + + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); + + public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + this.createTables(); +// this.createTmpTables(); + } + + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } + + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); +// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialog " +// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," +// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " +// + "FROM lareferencialog " +// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; +// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); +// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } + +// private void createTmpTables() throws Exception { +// +// try { +// Statement stmt = ConnectDB.getConnection().createStatement(); +// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; +// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialogtmp " +// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," +// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " +// + "FROM lareferencialogtmp " +// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); +// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); +// +// stmt.close(); +// log.info("Lareferencia Tmp Tables Created"); +// +// } catch (Exception e) { +// log.error("Failed to create tmptables: " + e); +// throw new Exception("Failed to create tmp tables: " + e.toString(), e); +// // System.exit(0); +// } +// } + + private String getPiwikLogUrl() { + return piwikUrl + "/"; + } + + private String getJson(String url) throws Exception { + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); +// response.append("\n"); + } + } + + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } + + public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { + + String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; + String content = ""; + + List siteIdsToVisit = new ArrayList(); + + // Getting all the siteIds in a list for logging reasons & limiting the list + // to the max number of siteIds + content = getJson(baseApiUrl); + JSONParser parser = new JSONParser(); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); + } + logger.info("Found the following siteIds for download: " + siteIdsToVisit); + + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 && + ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } + + logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); + + for (int siteId : siteIdsToVisit) { + logger.info("Now working on piwikId: " + siteId); + this.GetLaReFerenciaLogs(repoLogsPath, siteId); + } + } + + public void GetLaReFerenciaLogs(String repoLogsPath, + int laReferencialMatomoID) throws Exception { + + logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); + + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialog WHERE matomoid=? GROUP BY timestamp HAVING max(timestamp) is not null"); + st.setInt(1, laReferencialMatomoID); + + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + } + } + rs_date.close(); + + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + Date date = currDay.getTime(); + logger + .info( + "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " + + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + outFolder = repoLogsPath; + + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), + true); + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + int i = 0; + + JSONParser parser = new JSONParser(); + do { + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) + break; + + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + fin.write(jsonObjectRaw.toJSONString().getBytes()); + fin.writeChar('\n'); + } + + logger + .info( + "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID + + " and for " + + sdf.format(date)); + i++; + } while (true); + fin.close(); + + } + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java new file mode 100644 index 000000000..347d3de21 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java @@ -0,0 +1,423 @@ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.io.*; +import java.net.URLDecoder; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.sql.Statement; +import java.sql.Timestamp; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class LaReferenciaStats { + + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class); + + private String logRepoPath; + + private Statement stmt = null; + + private String CounterRobotsURL; + private ArrayList robotsList; + + public LaReferenciaStats(String logRepoPath) throws Exception { + this.logRepoPath = logRepoPath; + this.createTables(); +// this.createTmpTables(); + } + + /* + * private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) { + * log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } } + */ + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); +// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialog " +// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," +// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " +// + "FROM lareferencialog " +// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; +// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); +// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } + +// private void createTmpTables() throws Exception { +// +// try { +// Statement stmt = ConnectDB.getConnection().createStatement(); +// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; +// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialogtmp " +// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," +// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " +// + "FROM lareferencialogtmp " +// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); +// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); +// +// stmt.close(); +// log.info("Lareferencia Tmp Tables Created"); +// +// } catch (Exception e) { +// log.error("Failed to create tmptables: " + e); +// throw new Exception("Failed to create tmp tables: " + e.toString(), e); +// // System.exit(0); +// } +// } + + public void processLogs() throws Exception { + try { + logger.info("Processing LaReferencia repository logs"); + processlaReferenciaLog(); + logger.info("LaReferencia repository logs process done"); + + logger.info("LaReferencia removing double clicks"); + removeDoubleClicks(); + logger.info("LaReferencia removed double clicks"); + + logger.info("LaReferencia creating viewsStats"); + viewsStats(); + logger.info("LaReferencia created viewsStats"); + logger.info("LaReferencia creating downloadsStats"); + downloadsStats(); + logger.info("LaReferencia created downloadsStats"); + logger.info("LaReferencia updating Production Tables"); + updateProdTables(); + logger.info("LaReferencia updated Production Tables"); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + public void processlaReferenciaLog() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping lareferencialogtmp_json table"); + String drop_lareferencialogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp_json"; + stmt.executeUpdate(drop_lareferencialogtmp_json); + logger.info("Dropped lareferencialogtmp_json table"); + + logger.info("Creating lareferencialogtmp_json"); + String create_lareferencialogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `repItem` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " timestamp: STRING,\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " `customVariables`: struct<\n" + + " `1`: struct<\n" + + " `customVariablePageValue1`: STRING\n" + + " >,\n" + + " `2`: struct<\n" + + " `customVariablePageValue2`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + " >" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.lareferenciaLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_lareferencialogtmp_json); + logger.info("Created lareferencialogtmp_json"); + + logger.info("Dropping lareferencialogtmp table"); + String drop_lareferencialogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp"; + stmt.executeUpdate(drop_lareferencialogtmp); + logger.info("Dropped lareferencialogtmp table"); + + logger.info("Creating lareferencialogtmp"); + String create_lareferencialogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_lareferencialogtmp); + logger.info("Created lareferencialogtmp"); + + logger.info("Inserting into lareferencialogtmp"); + String insert_lareferencialogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp " + + "SELECT DISTINCT cast(idSite as INT) as matomoid, CONCAT('opendoar____::', " + + "actiondetail.customVariables.`2`.customVariablePageValue2) as source, idVisit as id_Visit, country, " + + "actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_lareferencialogtmp); + logger.info("Inserted into lareferencialogtmp"); + + stmt.close(); + } + + public void removeDoubleClicks() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning download double clicks"); + // clean download double clicks + String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " + + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " + + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " + + "AND p1.timestamp listHdfsDir(String dir) throws Exception { + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + // log.info("Found hdfs file " + fileName); + fileNames.add(fileName); + } + // hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logRepoPath)); + throw new Exception("HDFS file path with exported data does not exist : " + logRepoPath, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + if (!line.equals("[]")) { + sb.append(line); + } + // sb.append(line); + line = br.readLine(); + } + result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + if (result.equals("")) { + result = "[]"; + } + + // fs.close(); + } catch (Exception e) { + logger.error(e.getMessage()); + throw new Exception(e); + } + + return result; + } + +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java new file mode 100644 index 000000000..65816518f --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java @@ -0,0 +1,316 @@ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.io.*; +import java.net.Authenticator; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class PiwikDownloadLogs { + + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; + + /* + * The Piwik's API method + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + + private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); + + public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + + } + + private String getPiwikLogUrl() { + return "https://" + piwikUrl + "/"; + } + + private String getJson(String url) throws Exception { + try { + logger.debug("Connecting to download the JSON: " + url); + URL website = new URL(url); + URLConnection connection = website.openConnection(); + + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + url + " Exception: " + e); + throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); + } + } + + class WorkerThread implements Runnable { + private Calendar currDay; + private int siteId; + private String repoLogsPath; + private String portalLogPath; + private String portalMatomoID; + + public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws IOException { + this.currDay = (Calendar) currDay.clone(); + this.siteId = new Integer(siteId); + this.repoLogsPath = new String(repoLogsPath); + this.portalLogPath = new String(portalLogPath); + this.portalMatomoID = new String(portalMatomoID); + } + + public void run() { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + System.out + .println( + Thread.currentThread().getName() + " (Start) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + try { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + System.out + .println( + Thread.currentThread().getName() + " (End) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + } + + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + + int i = 0; + + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); + + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) + break; + + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + writtenBytes += jsonObjectRawBytes.length + 1; + } + + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); + + i++; + } while (true); + + fs.close(); + } + } + + public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { + + Statement statement = ConnectDB.getHiveConnection().createStatement(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + ResultSet rs = statement + .executeQuery( + "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() + + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + + // Getting all the piwikids in a list for logging reasons & limitting the list + // to the max number of piwikids + List piwikIdToVisit = new ArrayList(); + while (rs.next()) + piwikIdToVisit.add(rs.getInt(1)); + logger.info("Found the following piwikIds for download: " + piwikIdToVisit); + + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 && + ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { + logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } + + logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); + + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); + + //ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); + for (int siteId : piwikIdToVisit) { + + logger.info("Now working on piwikId: " + siteId); + + PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog WHERE source=?"); + st.setInt(1, siteId); + + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); + + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + } + } + rs_date.close(); + + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + //logger.info("Date used " + currDay.toString()); + //Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + //executor.execute(worker);// calling execute method of ExecutorService + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + } + } + //executor.shutdown(); + //while (!executor.isTerminated()) { + //} + //System.out.println("Finished all threads"); + } + + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + + int i = 0; + + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); + + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) + break; + + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + writtenBytes += jsonObjectRawBytes.length + 1; + } + + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); + + i++; + } while (true); + + fs.close(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java new file mode 100644 index 000000000..6e015acf4 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java @@ -0,0 +1,1255 @@ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.io.*; +import java.net.URLDecoder; +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class PiwikStatsDB { + + private String logPath; + private String logRepoPath; + private String logPortalPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); + + private String CounterRobotsURL; + private ArrayList robotsList; + + public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception { + this.logRepoPath = logRepoPath; + this.logPortalPath = logPortalPath; + + } + + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); + + logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); + + logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); + + logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); + } + + public void recreateDBAndTables() throws Exception { + this.createDatabase(); + this.createTables(); + // The piwiklog table is not needed since it is built + // on top of JSON files + this.createTmpTables(); + } + + public ArrayList getRobotsList() { + return robotsList; + } + + public void setRobotsList(ArrayList robotsList) { + this.robotsList = robotsList; + } + + public String getCounterRobotsURL() { + return CounterRobotsURL; + } + + public void setCounterRobotsURL(String CounterRobotsURL) { + this.CounterRobotsURL = CounterRobotsURL; + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + // Create Piwiklog table - This table should exist + String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) " + + "into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePiwikLog); + + ///////////////////////////////////////// + // Rule for duplicate inserts @ piwiklog + ///////////////////////////////////////// + + String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePortalLog); + + ////////////////////////////////////////////////// + // Rule for duplicate inserts @ process_portal_log + ////////////////////////////////////////////////// + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + private void createTmpTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTmpTablePiwikLog); + + ////////////////////////////////////////////////// + // Rule for duplicate inserts @ piwiklogtmp + ////////////////////////////////////////////////// + + ////////////////////////////////////////////////// + // Copy from public.piwiklog to piwiklog + ////////////////////////////////////////////////// + // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;"; + // stmt.executeUpdate(sqlCopyPublicPiwiklog); + + String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTmpTablePortalLog); + + ////////////////////////////////////////////////// + // Rule for duplicate inserts @ process_portal_log_tmp + ////////////////////////////////////////////////// + + stmt.close(); + + } catch (Exception e) { + logger.error("Failed to create tmptables: " + e); + throw new Exception("Failed to create tmp tables: " + e.toString(), e); + // System.exit(0); + } + } + + public void processLogs() throws Exception { + try { + ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); + this.robotsList = counterRobots.getRobotsPatterns(); + + logger.info("Processing repository logs"); + processRepositoryLog(); + logger.info("Repository logs process done"); + + logger.info("Removing double clicks"); + removeDoubleClicks(); + logger.info("Removing double clicks done"); + + logger.info("Cleaning oai"); + cleanOAI(); + logger.info("Cleaning oai done"); + + logger.info("Processing portal logs"); + processPortalLog(); + logger.info("Portal logs process done"); + + logger.info("Processing portal usagestats"); + portalStats(); + logger.info("Portal usagestats process done"); + + logger.info("ViewsStats processing starts"); + viewsStats(); + logger.info("ViewsStats processing ends"); + + logger.info("DownloadsStats processing starts"); + downloadsStats(); + logger.info("DownloadsStats processing starts"); + + + + logger.info("Updating Production Tables"); + updateProdTables(); + logger.info("Updated Production Tables"); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + public void processRepositoryLog() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping piwiklogtmp_json table"); + String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json"; + stmt.executeUpdate(drop_piwiklogtmp_json); + logger.info("Dropped piwiklogtmp_json table"); + + logger.info("Creating piwiklogtmp_json"); + String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " `customVariables`: struct<\n" + + " `1`: struct<\n" + + " `customVariablePageValue1`: STRING\n" + + " >\n" + + " >,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_piwiklogtmp_json); + logger.info("Created piwiklogtmp_json"); + + logger.info("Dropping piwiklogtmp table"); + String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp"; + stmt.executeUpdate(drop_piwiklogtmp); + logger.info("Dropped piwiklogtmp"); + + logger.info("Creating piwiklogtmp"); + String create_piwiklogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_piwiklogtmp); + logger.info("Created piwiklogtmp"); + + logger.info("Inserting into piwiklogtmp"); + String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + + "actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent\n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_piwiklogtmp); + logger.info("Inserted into piwiklogtmp"); + + stmt.close(); + } + + public void removeDoubleClicks() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning download double clicks"); + // clean download double clicks + String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "WHERE EXISTS (\n" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + + "WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" + + + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + + "AND p1.timestamp\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_process_portal_log_tmp_json); + logger.info("Created process_portal_log_tmp_json"); + + logger.info("Droping process_portal_log_tmp table"); + String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp"; + stmt.executeUpdate(drop_process_portal_log_tmp); + logger.info("Dropped process_portal_log_tmp"); + + logger.info("Creating process_portal_log_tmp"); + String create_process_portal_log_tmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_process_portal_log_tmp); + logger.info("Created process_portal_log_tmp"); + + logger.info("Inserting into process_portal_log_tmp"); + String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " + + + "actiondetail.url as url, " + + "CASE\n" + + " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + + " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " + + + " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + + " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + + " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + + " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + + " ELSE '' " + + "END AS entity_id, " + + "CASE " + + " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + + " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + + " ELSE '' " + + "END AS source_item_type, " + + "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + + "browser as agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_process_portal_log_tmp); + logger.info("Inserted into process_portal_log_tmp"); + + stmt.close(); + } + + public void portalStats() throws SQLException { + Connection con = ConnectDB.getHiveConnection(); + Statement stmt = con.createStatement(); + con.setAutoCommit(false); + +// Original queries where of the style +// +// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent +// FROM usagestats_20200907.process_portal_log_tmp2, +// openaire_prod_stats_20200821.result_oids roid +// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null +// +// The following query is an example of how queries should be +// +// +// INSERT INTO usagestats_20200907.piwiklogtmp +// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent +// FROM usagestats_20200907.process_portal_log_tmp +// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id +// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL); +// +// We should consider if we would like the queries to be as the following +// +// INSERT INTO usagestats_20200907.piwiklogtmp +// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent +// FROM usagestats_20200907.process_portal_log_tmp +// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id +// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND +// roid.oid != ''); + + logger.info("PortalStats - Step 1"); + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " + + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".result_oids roid WHERE roid.id IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("PortalStats - Step 2"); + stmt = con.createStatement(); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " + + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".datasource_oids roid WHERE roid.id IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + +/* logger.info("PortalStats - Step 3"); + stmt = con.createStatement(); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'organization', `timestamp`, referrer_name, agent " + + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".organization_oids roid WHERE roid.id IS NOT NULL)"; +// stmt.executeUpdate(sql); + stmt.close(); +*/ + logger.info("PortalStats - Step 3"); + stmt = con.createStatement(); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " + + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".project_oids roid WHERE roid.id IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + + con.close(); + } + + private void cleanOAI() throws Exception { + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning oai - Step 1"); + stmt = ConnectDB.getHiveConnection().createStatement(); + String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + + "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 2"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + + "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 3"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + + "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 4"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + + "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 5"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 6"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 7"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 8"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 9"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 10"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 11"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 12"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 13"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 14"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 15"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 16"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 17"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 18"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 19"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 20"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 21"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 22"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 23"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 24"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 25"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 26"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 27"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 28"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + + "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 29"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Done, closing connection"); + ConnectDB.getHiveConnection().close(); + } + + private String processPortalURL(String url) { + + if (url.indexOf("explore.openaire.eu") > 0) { + try { + url = URLDecoder.decode(url, "UTF-8"); + } catch (Exception e) { + logger.info("Error when decoding the following URL: " + url); + } + if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) { + url = "datasource|" + + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59); + } else if (url.indexOf("datasource=") > 0 + && url.substring(url.indexOf("datasource=") + 11).length() >= 46) { + url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57); + } else if (url.indexOf("datasourceFilter=") > 0 + && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) { + url = "datasource|" + + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63); + } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) { + url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56); + } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) { + url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56); + } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 + && !url.contains("oai:dnet:corda")) { + url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56); + } else if (url.indexOf("organizationId=") > 0 + && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) { + url = "organization|" + + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61); + } else { + url = ""; + } + } else { + url = ""; + } + + return url; + } + + private void updateProdTables() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Inserting data to piwiklog"); + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + stmt.executeUpdate(sql); + + logger.info("Inserting data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Inserting data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Inserting data to pageviews_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; + stmt.executeUpdate(sql); + +/* logger.info("Dropping table views_stats_tmp"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Dropping table downloads_stats_tmp"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Dropping table pageviews_stats_tmp"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Dropping table process_portal_log_tmp"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; + stmt.executeUpdate(sql); +*/ + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } + + private ArrayList listHdfsDir(String dir) throws Exception { + + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + fileNames.add(fileName); + } + + hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath)); + throw new Exception("HDFS file path with exported data does not exist : " + logPath, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + if (!line.equals("[]")) { + sb.append(line); + } + // sb.append(line); + line = br.readLine(); + } + result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + if (result.equals("")) { + result = "[]"; + } + + // fs.close(); + } catch (Exception e) { + logger.error(e.getMessage()); + throw new Exception(e); + } + + return result; + } + + private Connection getConnection() throws SQLException { + return ConnectDB.getHiveConnection(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java new file mode 100644 index 000000000..1708a1c64 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java @@ -0,0 +1,54 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.usagestats.export; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +/** + * @author D. Pierrakos, S. Zoupanos + */ +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.ArrayList; + +import org.json.JSONException; +import org.json.simple.JSONArray; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class ReadCounterRobotsList { + + private ArrayList robotsPatterns = new ArrayList(); + private String COUNTER_ROBOTS_URL; + + public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException { + COUNTER_ROBOTS_URL = url; + robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL); + } + + private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException { + InputStream is = new URL(url).openStream(); + JSONParser parser = new JSONParser(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1"))); + JSONArray jsonArray = (JSONArray) parser.parse(reader); + for (Object aJsonArray : jsonArray) { + org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray; + robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\")); + } + return robotsPatterns; + } + + public ArrayList getRobotsPatterns() { + return robotsPatterns; + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java new file mode 100644 index 000000000..295e98280 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java @@ -0,0 +1,571 @@ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.io.*; +// import java.io.BufferedReader; +// import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class SarcStats { + + private Statement stmtHive = null; + private Statement stmtImpala = null; + + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + + public SarcStats() throws Exception { +// createTables(); + } + + private void createTables() throws Exception { + try { + + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); + + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); + + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void reCreateLogDirs() throws IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); + + logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); + + logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); + + logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); + } + + public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping sarc_sushilogtmp_json_array table"); + String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); + logger.info("Dropped sarc_sushilogtmp_json_array table"); + + logger.info("Creating sarc_sushilogtmp_json_array table"); + String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_array); + logger.info("Created sarc_sushilogtmp_json_array table"); + + logger.info("Dropping sarc_sushilogtmp_json_non_array table"); + String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); + logger.info("Dropped sarc_sushilogtmp_json_non_array table"); + + logger.info("Creating sarc_sushilogtmp_json_non_array table"); + String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" + + " `ItemIdentifier` struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathNonArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); + logger.info("Created sarc_sushilogtmp_json_non_array table"); + + logger.info("Creating sarc_sushilogtmp table"); + String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(create_sarc_sushilogtmp); + logger.info("Created sarc_sushilogtmp table"); + + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "WHERE `ItemIdent`.`Type`='DOI'"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + + ConnectDB.getHiveConnection().close(); + } + + public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); + + logger.info("Dropping sarc_sushilogtmp table"); + String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp"; + stmt.executeUpdate(drop_sarc_sushilogtmp); + logger.info("Dropped sarc_sushilogtmp table"); + ConnectDB.getHiveConnection().close(); + + List issnAndUrls = new ArrayList(); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" + }); + issnAndUrls.add(new String[] { + "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" + }); + + if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 && + ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); + issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); + } + + logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); + + for (String[] issnAndUrl : issnAndUrls) { + logger.info("Now working on ISSN: " + issnAndUrl[1]); + getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); + } + + } + + public void finalizeSarcStats() throws Exception { + stmtHive = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + stmtImpala = ConnectDB.getImpalaConnection().createStatement(); + + logger.info("Creating downloads_stats table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmtHive.executeUpdate(createDownloadsStats); + logger.info("Created downloads_stats table"); + + logger.info("Dropping sarc_sushilogtmp_impala table"); + String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala"; + stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); + logger.info("Dropped sarc_sushilogtmp_impala table"); + + logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); + String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala " + + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(createSarcSushilogtmpImpala); + logger.info("Created sarc_sushilogtmp_impala"); + + logger.info("Making sarc_sushilogtmp visible to impala"); + String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala;"; + stmtImpala.executeUpdate(invalidateMetadata); + + logger.info("Dropping downloads_stats_impala table"); + String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala"; + stmtHive.executeUpdate(drop_downloads_stats_impala); + logger.info("Dropped downloads_stats_impala table"); + + logger.info("Making downloads_stats_impala deletion visible to impala"); + try { + String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala;"; + stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); + } catch (SQLException sqle) { + } + + // We run the following query in Impala because it is faster + logger.info("Creating downloads_stats_impala"); + String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala AS " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " + + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".datasource_results dr, " + + ConnectDB.getStatsDBSchema() + ".result_pids ro " + + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND dr.id=d.id AND dr.result=ro.id AND " + + "s.rid=ro.pid AND ro.type='Digital Object Identifier' AND metric_type='ft_total' AND s.source='SARC-OJS'"; + stmtImpala.executeUpdate(createDownloadsStatsImpala); + logger.info("Creating downloads_stats_impala"); + + // Insert into downloads_stats + logger.info("Inserting data from downloads_stats_impala into downloads_stats"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats SELECT * " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; + stmtHive.executeUpdate(insertDStats); + logger.info("Inserted into downloads_stats"); + + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository_id` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmtHive.executeUpdate(createSushilog); + logger.info("Created sushilog table"); + + // Insert into sushilog + logger.info("Inserting into sushilog"); + String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(insertSushiLog); + logger.info("Inserted into sushilog"); + + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + } + + public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, + String url, String issn) throws Exception { + logger.info("Processing SARC! issn: " + issn + " with url: " + url); + ConnectDB.getHiveConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, issn); + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + } + } + rs_date.close(); + + // Creating the needed configuration for the correct storing of data + Configuration config = new Configuration(); + config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); + config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); + config + .set( + "fs.hdfs.impl", + org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + config + .set( + "fs.file.impl", + org.apache.hadoop.fs.LocalFileSystem.class.getName()); + FileSystem dfs = FileSystem.get(config); + + while (start.before(end)) { + String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); + start.add(Calendar.MONTH, 1); + + logger.info("(getARReport) Getting report: " + reportUrl); + String text = getJson(reportUrl); + if (text == null) { + continue; + } + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = null; + try { + jsonObject = (JSONObject) parser.parse(text); + } + // if there is a parsing error continue with the next url + catch (ParseException pe) { + continue; + } + + jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("sc:Report"); + if (jsonObject == null) { + continue; + } + jsonObject = (JSONObject) jsonObject.get("c:Report"); + jsonObject = (JSONObject) jsonObject.get("c:Customer"); + Object obj = jsonObject.get("c:ReportItems"); + JSONArray jsonArray = new JSONArray(); + if (obj instanceof JSONObject) { + jsonArray.add(obj); + } else { + jsonArray = (JSONArray) obj; + // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); + } + if (jsonArray == null) { + continue; + } + + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathArray); + FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); + + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathNonArray); + FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); + + for (Object aJsonArray : jsonArray) { + + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + renameKeysRecursively(":", jsonObjectRow); + + if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { + finNonArray.write(jsonObjectRow.toJSONString().getBytes()); + finNonArray.writeChar('\n'); + } else { + finArray.write(jsonObjectRow.toJSONString().getBytes()); + finArray.writeChar('\n'); + } + } + + finArray.close(); + finNonArray.close(); + + // Check the file size and if it is too big, delete it + File fileArray = new File(filePathArray); + if (fileArray.length() == 0) + fileArray.delete(); + File fileNonArray = new File(filePathNonArray); + if (fileNonArray.length() == 0) + fileNonArray.delete(); + + } + + dfs.close(); + + ConnectDB.getHiveConnection().close(); + } + + private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { + for (Object jjval : givenJsonObj) { + if (jjval instanceof JSONArray) + renameKeysRecursively(delimiter, (JSONArray) jjval); + else if (jjval instanceof JSONObject) + renameKeysRecursively(delimiter, (JSONObject) jjval); + // All other types of vals + else + ; + } + } + + private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { + Set jkeys = new HashSet(givenJsonObj.keySet()); + for (String jkey : jkeys) { + + String[] splitArray = jkey.split(delimiter); + String newJkey = splitArray[splitArray.length - 1]; + + Object jval = givenJsonObj.get(jkey); + givenJsonObj.remove(jkey); + givenJsonObj.put(newJkey, jval); + + if (jval instanceof JSONObject) + renameKeysRecursively(delimiter, (JSONObject) jval); + + if (jval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jval); + } + } + } + + private String getJson(String url) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + + // Logging error and silently continuing + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); +// return null; +// throw new Exception("Failed to get URL: " + e.toString(), e); + } + return ""; + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java new file mode 100644 index 000000000..405b58bd5 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java @@ -0,0 +1,179 @@ + +package eu.dnetlib.oa.graph.usagestats.export; + +import java.io.IOException; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Main class for downloading and processing Usage statistics + * + * @author D. Pierrakos, S. Zoupanos + */ +public class UsageStatsExporter { + + public UsageStatsExporter() { + + } + + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + private void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); + + logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); + + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + + logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); + + logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); + + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } + + public void export() throws Exception { + + logger.info("Initialising DB properties"); + ConnectDB.init(); + +// runImpalaQuery(); + + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + + logger.info("Re-creating database and tables"); + if (ExecuteWorkflow.recreateDbAndTables) + piwikstatsdb.recreateDBAndTables(); + ; + + logger.info("Initializing the download logs module"); + PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); + + if (ExecuteWorkflow.piwikEmptyDirs) { + logger.info("Recreating Piwik log directories"); + piwikstatsdb.reCreateLogDirs(); + } + + // Downloading piwik logs (also managing directory creation) + if (ExecuteWorkflow.downloadPiwikLogs) { + logger.info("Downloading piwik logs"); + piwd + .GetOpenAIRELogs( + ExecuteWorkflow.repoLogPath, + ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); + } + logger.info("Downloaded piwik logs"); + + // Create DB tables, insert/update statistics + String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; + piwikstatsdb.setCounterRobotsURL(cRobotsUrl); + + if (ExecuteWorkflow.processPiwikLogs) { + logger.info("Processing logs"); + piwikstatsdb.processLogs(); + } + + logger.info("Creating LaReferencia tables"); + LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, + ExecuteWorkflow.lareferenciaAuthToken); + + if (ExecuteWorkflow.laReferenciaEmptyDirs) { + logger.info("Recreating LaReferencia log directories"); + lrf.reCreateLogDirs(); + } + + if (ExecuteWorkflow.downloadLaReferenciaLogs) { + logger.info("Downloading LaReferencia logs"); + lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); + logger.info("Downloaded LaReferencia logs"); + } + LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); + + if (ExecuteWorkflow.processLaReferenciaLogs) { + logger.info("Processing LaReferencia logs"); + lastats.processLogs(); + logger.info("LaReferencia logs done"); + } + + IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); + if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { + logger.info("Creating Irus Stats tables"); + irusstats.createTables(); + logger.info("Created Irus Stats tables"); + + logger.info("Re-create log dirs"); + irusstats.reCreateLogDirs(); + logger.info("Re-created log dirs"); + } + + if (ExecuteWorkflow.irusDownloadReports) { + irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); + } + if (ExecuteWorkflow.irusProcessStats) { + irusstats.processIrusStats(); + logger.info("Irus done"); + } + + SarcStats sarcStats = new SarcStats(); + if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { + sarcStats.reCreateLogDirs(); + } + if (ExecuteWorkflow.sarcDownloadReports) { + sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + } + if (ExecuteWorkflow.sarcProcessStats) { + sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + sarcStats.finalizeSarcStats(); + } + logger.info("Sarc done"); + + // finalize usagestats + if (ExecuteWorkflow.finalizeStats) { + piwikstatsdb.finalizeStats(); + logger.info("Finalized stats"); + } + + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } + + logger.info("End"); + } + + private void invalidateMetadata() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getImpalaConnection().createStatement(); + + String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json new file mode 100644 index 000000000..988c23b48 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json @@ -0,0 +1,231 @@ +[ + { + "paramName": "mat", + "paramLongName": "matomoAuthToken", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "mbu", + "paramLongName": "matomoBaseURL", + "paramDescription": "URL of the isLookUp Service", + "paramRequired": true + }, + { + "paramName": "rlp", + "paramLongName": "repoLogPath", + "paramDescription": "nameNode of the source cluster", + "paramRequired": true + }, + { + "paramName": "plp", + "paramLongName": "portalLogPath", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "pmi", + "paramLongName": "portalMatomoID", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "iukbuw", + "paramLongName": "irusUKBaseURL", + "paramDescription": "working directory", + "paramRequired": true + }, + { + "paramName": "iukrp", + "paramLongName": "irusUKReportPath", + "paramDescription": "maximum number of map tasks used in the distcp process", + "paramRequired": true + }, + { + "paramName": "srpa", + "paramLongName": "sarcsReportPathArray", + "paramDescription": "memory for distcp action copying actionsets from remote cluster", + "paramRequired": true + }, + { + "paramName": "srpna", + "paramLongName": "sarcsReportPathNonArray", + "paramDescription": "timeout for distcp copying actions from remote cluster", + "paramRequired": true + }, + { + "paramName": "llp", + "paramLongName": "lareferenciaLogPath", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "lbu", + "paramLongName": "lareferenciaBaseURL", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "lat", + "paramLongName": "lareferenciaAuthToken", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "usdbs", + "paramLongName": "usageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "piwikEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ppwl", + "paramLongName": "processPiwikLogs", + "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "dpwl", + "paramLongName": "downloadPiwikLogs", + "paramDescription": "download piwik logs?", + "paramRequired": true + }, + { + "paramName": "slp", + "paramLongName": "startingLogPeriod", + "paramDescription": "Starting log period", + "paramRequired": true + }, + { + "paramName": "elp", + "paramLongName": "endingLogPeriod", + "paramDescription": "Ending log period", + "paramRequired": true + }, + { + "paramName": "npidd", + "paramLongName": "numberOfPiwikIdsToDownload", + "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload", + "paramRequired": true + }, + { + "paramName": "nsidd", + "paramLongName": "numberOfSiteIdsToDownload", + "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload", + "paramRequired": true + }, + { + "paramName": "lerd", + "paramLongName": "laReferenciaEmptyDirs", + "paramDescription": "Empty LaReferencia directories?", + "paramRequired": true + }, + { + "paramName": "plrl", + "paramLongName": "processLaReferenciaLogs", + "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "dlrl", + "paramLongName": "downloadLaReferenciaLogs", + "paramDescription": "download La Referencia logs?", + "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "irusCreateTablesEmptyDirs", + "paramDescription": "Irus section: Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "irusDownloadReports", + "paramDescription": "Irus section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "irusProcessStats", + "paramDescription": "Irus section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "irusNumberOfOpendoarsToDownload", + "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", + "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "sarcCreateTablesEmptyDirs", + "paramDescription": "Sarc section: Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "sarcDownloadReports", + "paramDescription": "Sarc section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "sarcProcessStats", + "paramDescription": "Sarc section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "sarcNumberOfIssnToDownload", + "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", + "paramRequired": true + }, + + { + "paramName": "fs", + "paramLongName": "finalizeStats", + "paramDescription": "Create the usage_stats table?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", + "paramRequired": true + }, + { + "paramName": "nodt", + "paramLongName": "numberOfDownloadThreads", + "paramDescription": "Number of download threads", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml new file mode 100644 index 000000000..b5c807378 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml new file mode 100644 index 000000000..8d62a85a9 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml @@ -0,0 +1,90 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + eu.dnetlib.oa.graph.usagestats.export.ExecuteWorkflow + --matomoAuthToken${matomoAuthToken} + --matomoBaseURL${matomoBaseURL} + --repoLogPath${repoLogPath} + --portalLogPath${portalLogPath} + --portalMatomoID${portalMatomoID} + --irusUKBaseURL${irusUKBaseURL} + --irusUKReportPath${irusUKReportPath} + --sarcsReportPathArray${sarcsReportPathArray} + --sarcsReportPathNonArray${sarcsReportPathNonArray} + --lareferenciaLogPath${lareferenciaLogPath} + --lareferenciaBaseURL${lareferenciaBaseURL} + --lareferenciaAuthToken${lareferenciaAuthToken} + --dbHiveUrl${hiveJdbcUrl} + --dbImpalaUrl${impalaJdbcUrl} + --usageStatsDBSchema${usageStatsDBSchema} + --statsDBSchema${statsDBSchema} + --recreateDbAndTables${recreateDbAndTables} + --piwikEmptyDirs${piwikEmptyDirs} + --downloadPiwikLogs${downloadPiwikLogs} + --processPiwikLogs${processPiwikLogs} + --startingLogPeriod${startingLogPeriod} + --endingLogPeriod${endingLogPeriod} + --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} + --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} + --laReferenciaEmptyDirs${laReferenciaEmptyDirs} + --downloadLaReferenciaLogs${downloadLaReferenciaLogs} + --processLaReferenciaLogs${processLaReferenciaLogs} + --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} + --irusDownloadReports${irusDownloadReports} + --irusProcessStats${irusProcessStats} + --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload} + --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs} + --sarcDownloadReports${sarcDownloadReports} + --sarcProcessStats${sarcProcessStats} + --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} + --finalizeStats${finalizeStats} + --finalTablesVisibleToImpala${finalTablesVisibleToImpala} + --numberOfDownloadThreads${numberOfDownloadThreads} + + + + + + + + From 32bf943979f5e111f0841bdeb7c61a2aa7cbeb1b Mon Sep 17 00:00:00 2001 From: Dimitris Date: Mon, 2 Nov 2020 09:08:25 +0200 Subject: [PATCH 02/16] Changes to download only updates --- .../oa/graph/usagestats/export/ConnectDB.java | 52 +- .../oa/graph/usagestats/export/IrusStats.java | 600 +++++------ .../export/LaReferenciaDownloadLogs.java | 324 +++--- .../usagestats/export/PiwikDownloadLogs.java | 145 +-- .../graph/usagestats/export/PiwikStatsDB.java | 85 +- .../oa/graph/usagestats/export/SarcStats.java | 929 +++++++++--------- .../usagestats/export/UsageStatsExporter.java | 2 +- 7 files changed, 1072 insertions(+), 1065 deletions(-) diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java index ffc7c74cd..29dd5648b 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java @@ -78,20 +78,20 @@ public abstract class ConnectDB { */ ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbHiveUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); - cpds.setCheckoutTimeout(30000); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); return cpds.getConnection(); } @@ -103,23 +103,23 @@ public abstract class ConnectDB { */ ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbImpalaUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); - cpds.setCheckoutTimeout(30000); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); - return cpds.getConnection(); + return cpds.getConnection(); } -} +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java index 749687ec5..6947381c9 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java @@ -1,4 +1,3 @@ - package eu.dnetlib.oa.graph.usagestats.export; import java.io.*; @@ -28,38 +27,38 @@ import org.slf4j.LoggerFactory; */ public class IrusStats { - private String irusUKURL; + private String irusUKURL; - private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); - public IrusStats(String irusUKURL) throws Exception { - this.irusUKURL = irusUKURL; - // The following may not be needed - It will be created when JSON tables are created + public IrusStats(String irusUKURL) throws Exception { + this.irusUKURL = irusUKURL; + // The following may not be needed - It will be created when JSON tables are created // createTmpTables(); - } + } - public void reCreateLogDirs() throws Exception { - FileSystem dfs = FileSystem.get(new Configuration()); + public void reCreateLogDirs() throws Exception { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); + logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); - logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); - } + logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); + } - public void createTables() throws Exception { - try { - logger.info("Creating sushilog"); - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog(source STRING, " + - "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + - "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableSushiLog); - logger.info("Created sushilog"); + public void createTables() throws Exception { + try { + logger.info("Creating sushilog"); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); - // To see how to apply to the ignore duplicate rules and indexes + // To see how to apply to the ignore duplicate rules and indexes // stmt.executeUpdate(sqlCreateTableSushiLog); // String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " // + " ON INSERT TO sushilog " @@ -70,15 +69,14 @@ public class IrusStats { // stmt.executeUpdate(sqlcreateRuleSushiLog); // String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; // stmt.executeUpdate(createSushiIndex); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } // // The following may not be needed - It will be created when JSON tables are created // private void createTmpTables() throws Exception { @@ -107,311 +105,315 @@ public class IrusStats { // throw new Exception("Failed to create tables: " + e.toString(), e); // } // } + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - public void processIrusStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); + logger.info("Dropping sushilogtmp_json table"); + String dropSushilogtmpJson = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sushilogtmp_json"; + stmt.executeUpdate(dropSushilogtmpJson); + logger.info("Dropped sushilogtmp_json table"); - logger.info("Dropping sushilogtmp_json table"); - String dropSushilogtmpJson = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".sushilogtmp_json"; - stmt.executeUpdate(dropSushilogtmpJson); - logger.info("Dropped sushilogtmp_json table"); + logger.info("Creating irus_sushilogtmp_json table"); + String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " Type: STRING,\n" + + " Value: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` ARRAY<\n" + + " struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(createSushilogtmpJson); + logger.info("Created irus_sushilogtmp_json table"); - logger.info("Creating irus_sushilogtmp_json table"); - String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" + - " `ItemIdentifier` ARRAY<\n" + - " struct<\n" + - " Type: STRING,\n" + - " Value: STRING\n" + - " >\n" + - " >,\n" + - " `ItemPerformance` ARRAY<\n" + - " struct<\n" + - " `Period`: struct<\n" + - " `Begin`: STRING,\n" + - " `End`: STRING\n" + - " >,\n" + - " `Instance`: struct<\n" + - " `Count`: STRING,\n" + - " `MetricType`: STRING\n" + - " >\n" + - " >\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(createSushilogtmpJson); - logger.info("Created irus_sushilogtmp_json table"); + logger.info("Dropping irus_sushilogtmp table"); + String dropSushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(dropSushilogtmp); + logger.info("Dropped irus_sushilogtmp table"); - logger.info("Dropping irus_sushilogtmp table"); - String dropSushilogtmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".irus_sushilogtmp"; - stmt.executeUpdate(dropSushilogtmp); - logger.info("Dropped irus_sushilogtmp table"); + logger.info("Creating irus_sushilogtmp table"); + String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(createSushilogtmp); + logger.info("Created irus_sushilogtmp table"); - logger.info("Creating irus_sushilogtmp table"); - String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp(source STRING, repository STRING, " + - "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + - "tblproperties('transactional'='true')"; - stmt.executeUpdate(createSushilogtmp); - logger.info("Created irus_sushilogtmp table"); + logger.info("Inserting to irus_sushilogtmp table"); + String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " + + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " + + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + + "WHERE `ItemIdent`.`Type`= 'OAI'"; + stmt.executeUpdate(insertSushilogtmp); + logger.info("Inserted to irus_sushilogtmp table"); - logger.info("Inserting to irus_sushilogtmp table"); - String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " + - "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " + - "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + - "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " + - "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + - "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + - "WHERE `ItemIdent`.`Type`= 'OAI'"; - stmt.executeUpdate(insertSushilogtmp); - logger.info("Inserted to irus_sushilogtmp table"); + logger.info("Creating downloads_stats table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created downloads_stats table"); - logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats " + - "(`source` string, " + - "`repository_id` string, " + - "`result_id` string, " + - "`date` string, " + - "`count` bigint, " + - "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats table"); + logger.info("Inserting into downloads_stats"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; + stmt.executeUpdate(insertDStats); + logger.info("Inserted into downloads_stats"); - logger.info("Inserting into downloads_stats"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT s.source, d.id AS repository_id, " + - "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " + - ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + - ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; - stmt.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats"); + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository_id` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " + - "(`source` string, " + - "`repository_id` string, " + - "`rid` string, " + - "`date` string, " + - "`metric_type` string, " + - "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); + logger.info("Inserting to sushilog table"); + String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(insertToShushilog); + logger.info("Inserted to sushilog table"); - logger.info("Inserting to sushilog table"); - String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " + - ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(insertToShushilog); - logger.info("Inserted to sushilog table"); + ConnectDB.getHiveConnection().close(); + } - ConnectDB.getHiveConnection().close(); - } + public void getIrusRRReport(String irusUKReportPath) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); - public void getIrusRRReport(String irusUKReportPath) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); + String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; - String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + - sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + - "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + logger.info("(getIrusRRReport) Getting report: " + reportUrl); - logger.info("(getIrusRRReport) Getting report: " + reportUrl); + String text = getJson(reportUrl, "", ""); - String text = getJson(reportUrl, "", ""); + List opendoarsToVisit = new ArrayList(); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + int i = 0; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject opendoar = (JSONObject) identifier; + if (opendoar.get("Type").toString().equals("OpenDOAR")) { + i++; + opendoarsToVisit.add(opendoar.get("Value").toString()); + break; + } + } + // break; + } - List opendoarsToVisit = new ArrayList(); - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - int i = 0; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); - for (Object identifier : itemIdentifier) { - JSONObject opendoar = (JSONObject) identifier; - if (opendoar.get("Type").toString().equals("OpenDOAR")) { - i++; - opendoarsToVisit.add(opendoar.get("Value").toString()); - break; - } - } - // break; - } + logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); - logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); + if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 + && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + } - if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 && - ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - } + logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); - logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); + for (String opendoar : opendoarsToVisit) { + logger.info("Now working on openDoar: " + opendoar); + this.getIrusIRReport(opendoar, irusUKReportPath); + } - for (String opendoar : opendoarsToVisit) { - logger.info("Now working on openDoar: " + opendoar); - this.getIrusIRReport(opendoar, irusUKReportPath); - } + logger.info("(getIrusRRReport) Finished with report: " + reportUrl); + } - logger.info("(getIrusRRReport) Finished with report: " + reportUrl); - } + private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { - private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { + logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); - logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); + ConnectDB.getHiveConnection().setAutoCommit(false); - ConnectDB.getHiveConnection().setAutoCommit(false); + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, "opendoar____::" + opendoar); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + int batch_size = 0; - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, "opendoar____::" + opendoar); - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - } - } - rs_date.close(); - int batch_size = 0; + if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); + } else { + while (start.before(end)) { + logger.info("date: " + simpleDateFormat.format(start.getTime())); + String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) + + "&RepositoryIdentifier=opendoar%3A" + opendoar + + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + start.add(Calendar.MONTH, 1); - while (start.before(end)) { - // log.info("date: " + simpleDateFormat.format(start.getTime())); - String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) - + "&RepositoryIdentifier=opendoar%3A" + opendoar - + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; - start.add(Calendar.MONTH, 1); + logger.info("Downloading file: " + reportUrl); + String text = getJson(reportUrl, "", ""); + if (text == null) { + continue; + } - logger.info("Downloading file: " + reportUrl); - String text = getJson(reportUrl, "", ""); - if (text == null) { - continue; - } + FileSystem fs = FileSystem.get(new Configuration()); + String filePath = irusUKReportPath + "/" + "IrusIRReport_" + + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePath); + FSDataOutputStream fin = fs.create(new Path(filePath), true); - FileSystem fs = FileSystem.get(new Configuration()); - String filePath = irusUKReportPath + "/" + "IrusIRReport_" + - opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePath); - FSDataOutputStream fin = fs.create(new Path(filePath), true); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray == null) { + continue; + } + String oai = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + fin.write(jsonObjectRow.toJSONString().getBytes()); + fin.writeChar('\n'); + } - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - if (jsonArray == null) { - continue; - } - String oai = ""; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - fin.write(jsonObjectRow.toJSONString().getBytes()); - fin.writeChar('\n'); - } + fin.close(); + } - fin.close(); - } + } + //ConnectDB.getHiveConnection().close(); - ConnectDB.getHiveConnection().close(); + logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); + } - logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); - } + private String getJson(String url) throws Exception { + try { + System.out.println("===> Connecting to: " + url); + URL website = new URL(url); + System.out.println("Connection url -----> " + url); + URLConnection connection = website.openConnection(); - private String getJson(String url) throws Exception { - try { - System.out.println("===> Connecting to: " + url); - URL website = new URL(url); - System.out.println("Connection url -----> " + url); - URLConnection connection = website.openConnection(); - - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); // response.append("\n"); - } - } + } + } - System.out.println("response ====> " + response.toString()); + System.out.println("response ====> " + response.toString()); - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } - private String getJson(String url, String username, String password) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL", e); - return null; - } - } + private String getJson(String url, String username, String password) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL", e); + return null; + } + } } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java index 0e0e013cf..7a61b1f46 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java @@ -1,4 +1,3 @@ - package eu.dnetlib.oa.graph.usagestats.export; import java.io.*; @@ -28,49 +27,49 @@ import org.slf4j.LoggerFactory; */ public class LaReferenciaDownloadLogs { - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; - /* + /* * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; - private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; - private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); - public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; - this.createTables(); + public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + this.createTables(); // this.createTmpTables(); - } + } - public void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); - logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); - } + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } - private void createTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating LaReferencia tables"); - String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + - "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + - "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + - "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableLareferenciaLog); - logger.info("Created LaReferencia tables"); + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); // String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " // + " ON INSERT TO lareferencialog " // + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," @@ -81,16 +80,16 @@ public class LaReferenciaDownloadLogs { // stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); // stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Lareferencia Tables Created"); + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - // System.exit(0); - } - } + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } // private void createTmpTables() throws Exception { // @@ -115,147 +114,152 @@ public class LaReferenciaDownloadLogs { // // System.exit(0); // } // } + private String getPiwikLogUrl() { + return piwikUrl + "/"; + } - private String getPiwikLogUrl() { - return piwikUrl + "/"; - } + private String getJson(String url) throws Exception { + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); - private String getJson(String url) throws Exception { - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); // response.append("\n"); - } - } + } + } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } - public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { + public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { - String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; - String content = ""; + String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; + String content = ""; - List siteIdsToVisit = new ArrayList(); + List siteIdsToVisit = new ArrayList(); - // Getting all the siteIds in a list for logging reasons & limiting the list - // to the max number of siteIds - content = getJson(baseApiUrl); - JSONParser parser = new JSONParser(); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); - } - logger.info("Found the following siteIds for download: " + siteIdsToVisit); + // Getting all the siteIds in a list for logging reasons & limiting the list + // to the max number of siteIds + content = getJson(baseApiUrl); + JSONParser parser = new JSONParser(); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); + } + logger.info("Found the following siteIds for download: " + siteIdsToVisit); - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 && - ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } - logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); + logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); - for (int siteId : siteIdsToVisit) { - logger.info("Now working on piwikId: " + siteId); - this.GetLaReFerenciaLogs(repoLogsPath, siteId); - } - } + for (int siteId : siteIdsToVisit) { + logger.info("Now working on LaReferencia MatomoId: " + siteId); + this.GetLaReFerenciaLogs(repoLogsPath, siteId); + } + } - public void GetLaReFerenciaLogs(String repoLogsPath, - int laReferencialMatomoID) throws Exception { + public void GetLaReFerenciaLogs(String repoLogsPath, + int laReferencialMatomoID) throws Exception { - logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); + logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + - ".lareferencialog WHERE matomoid=? GROUP BY timestamp HAVING max(timestamp) is not null"); - st.setInt(1, laReferencialMatomoID); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialog WHERE matomoid=?"); + st.setInt(1, laReferencialMatomoID); + Date dateMax = null; - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - } - } - rs_date.close(); + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - Date date = currDay.getTime(); - logger - .info( - "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " - + sdf.format(date)); + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + Date date = currDay.getTime(); + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID); + } else { + logger + .info( + "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " + + sdf.format(date)); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - outFolder = repoLogsPath; + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + outFolder = repoLogsPath; - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), - true); + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), + true); - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - int i = 0; + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + int i = 0; - JSONParser parser = new JSONParser(); - do { - String apiUrl = baseApiUrl; + JSONParser parser = new JSONParser(); + do { + String apiUrl = baseApiUrl; - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) - break; + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - fin.write(jsonObjectRaw.toJSONString().getBytes()); - fin.writeChar('\n'); - } + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + fin.write(jsonObjectRaw.toJSONString().getBytes()); + fin.writeChar('\n'); + } - logger - .info( - "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID - + " and for " - + sdf.format(date)); - i++; - } while (true); - fin.close(); - - } - } + logger + .info( + "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID + + " and for " + + sdf.format(date)); + i++; + } while (true); + fin.close(); + } + } + } } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java index 65816518f..7a64e48d2 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java @@ -204,6 +204,9 @@ public class PiwikDownloadLogs { logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); + + // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); + for (int siteId : piwikIdToVisit) { // Setting the starting period Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); logger.info("Starting period for log download: " + sdf.format(start.getTime())); @@ -214,9 +217,6 @@ public class PiwikDownloadLogs { end.add(Calendar.DAY_OF_MONTH, -1); logger.info("Ending period for log download: " + sdf.format(end.getTime())); - //ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); - for (int siteId : piwikIdToVisit) { - logger.info("Now working on piwikId: " + siteId); PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION @@ -224,7 +224,7 @@ public class PiwikDownloadLogs { "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog WHERE source=?"); st.setInt(1, siteId); - + Date dateMax=null; ResultSet rs_date = st.executeQuery(); while (rs_date.next()) { logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); @@ -232,85 +232,92 @@ public class PiwikDownloadLogs { if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") && !rs_date.getString(1).equals("")) { start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); } - } + } rs_date.close(); for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - //logger.info("Date used " + currDay.toString()); - //Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - //executor.execute(worker);// calling execute method of ExecutorService - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + // logger.info("Date used " + currDay.toString()); + // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + // executor.execute(worker);// calling execute method of ExecutorService + logger.info("Date used " + currDay.getTime().toString()); + + if(dateMax!=null && currDay.getTime().compareTo(dateMax)<=0) + logger.info("Date found in logs "+dateMax+ " and not downloanding Matomo logs for "+siteId); + else + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + } } - //executor.shutdown(); - //while (!executor.isTerminated()) { - //} - //System.out.println("Finished all threads"); + // executor.shutdown(); + // while (!executor.isTerminated()) { + // } + // System.out.println("Finished all threads"); } - - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + + int i = 0; + + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); + + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); } - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) + break; - int i = 0; + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); + writtenBytes += jsonObjectRawBytes.length + 1; + } - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + i++; + } while (true); - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) - break; - - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - writtenBytes += jsonObjectRawBytes.length + 1; - } - - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); - - i++; - } while (true); - - fs.close(); - } + fs.close(); + } } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java index 6e015acf4..6625c381b 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java @@ -199,14 +199,14 @@ public class PiwikStatsDB { cleanOAI(); logger.info("Cleaning oai done"); - logger.info("Processing portal logs"); + logger.info("Processing portal logs"); processPortalLog(); logger.info("Portal logs process done"); logger.info("Processing portal usagestats"); portalStats(); logger.info("Portal usagestats process done"); - + logger.info("ViewsStats processing starts"); viewsStats(); logger.info("ViewsStats processing ends"); @@ -215,8 +215,6 @@ public class PiwikStatsDB { downloadsStats(); logger.info("DownloadsStats processing starts"); - - logger.info("Updating Production Tables"); updateProdTables(); logger.info("Updated Production Tables"); @@ -313,7 +311,7 @@ public class PiwikStatsDB { "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + - "WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + "AND p1.timestamp\n" + - " >,\n" + - " `ItemPerformance` struct<\n" + - " `Period`: struct<\n" + - " `Begin`: STRING,\n" + - " `End`: STRING\n" + - " >,\n" + - " `Instance`: struct<\n" + - " `Count`: STRING,\n" + - " `MetricType`: STRING\n" + - " >\n" + - " >\n" + - ")" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + sarcsReportPathArray + "/'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_array); - logger.info("Created sarc_sushilogtmp_json_array table"); + logger.info("Creating sarc_sushilogtmp_json_array table"); + String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_array); + logger.info("Created sarc_sushilogtmp_json_array table"); - logger.info("Dropping sarc_sushilogtmp_json_non_array table"); - String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); - logger.info("Dropped sarc_sushilogtmp_json_non_array table"); + logger.info("Dropping sarc_sushilogtmp_json_non_array table"); + String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); + logger.info("Dropped sarc_sushilogtmp_json_non_array table"); - logger.info("Creating sarc_sushilogtmp_json_non_array table"); - String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" + - " `ItemIdentifier` struct<\n" + - " `Type`: STRING,\n" + - " `Value`: STRING\n" + - " >,\n" + - " `ItemPerformance` struct<\n" + - " `Period`: struct<\n" + - " `Begin`: STRING,\n" + - " `End`: STRING\n" + - " >,\n" + - " `Instance`: struct<\n" + - " `Count`: STRING,\n" + - " `MetricType`: STRING\n" + - " >\n" + - " >" + - ")" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + sarcsReportPathNonArray + "/'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); - logger.info("Created sarc_sushilogtmp_json_non_array table"); + logger.info("Creating sarc_sushilogtmp_json_non_array table"); + String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" + + " `ItemIdentifier` struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathNonArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); + logger.info("Created sarc_sushilogtmp_json_non_array table"); - logger.info("Creating sarc_sushilogtmp table"); - String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp(source STRING, repository STRING, " + - "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + - "tblproperties('transactional'='true')"; - stmt.executeUpdate(create_sarc_sushilogtmp); - logger.info("Created sarc_sushilogtmp table"); + logger.info("Creating sarc_sushilogtmp table"); + String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(create_sarc_sushilogtmp); + logger.info("Created sarc_sushilogtmp table"); - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + - "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + - " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + - "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " + - "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + - "WHERE `ItemIdent`.`Type`='DOI'"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "WHERE `ItemIdent`.`Type`='DOI'"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + - "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + - "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + - "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - ConnectDB.getHiveConnection().close(); - } + ConnectDB.getHiveConnection().close(); + } - public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " + - "(`source` string, " + - "`repository` string, " + - "`rid` string, " + - "`date` string, " + - "`metric_type` string, " + - "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); - logger.info("Dropping sarc_sushilogtmp table"); - String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".sarc_sushilogtmp"; - stmt.executeUpdate(drop_sarc_sushilogtmp); - logger.info("Dropped sarc_sushilogtmp table"); - ConnectDB.getHiveConnection().close(); + logger.info("Dropping sarc_sushilogtmp table"); + String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp"; + stmt.executeUpdate(drop_sarc_sushilogtmp); + logger.info("Dropped sarc_sushilogtmp table"); + ConnectDB.getHiveConnection().close(); - List issnAndUrls = new ArrayList(); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" - }); - issnAndUrls.add(new String[] { - "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" - }); - issnAndUrls.add(new String[] { - "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" - }); + List issnAndUrls = new ArrayList(); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" + }); + issnAndUrls.add(new String[]{ + "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" + }); - if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 && - ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); - issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); - } + if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 + && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); + issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); + } - logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); + logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); - for (String[] issnAndUrl : issnAndUrls) { - logger.info("Now working on ISSN: " + issnAndUrl[1]); - getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); - } + for (String[] issnAndUrl : issnAndUrls) { + logger.info("Now working on ISSN: " + issnAndUrl[1]); + getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); + } - } + } - public void finalizeSarcStats() throws Exception { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - stmtImpala = ConnectDB.getImpalaConnection().createStatement(); + public void finalizeSarcStats() throws Exception { + stmtHive = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + stmtImpala = ConnectDB.getImpalaConnection().createStatement(); - logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats " + - "(`source` string, " + - "`repository_id` string, " + - "`result_id` string, " + - "`date` string, " + - "`count` bigint, " + - "`openaire` bigint)"; - stmtHive.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats table"); + logger.info("Creating downloads_stats table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmtHive.executeUpdate(createDownloadsStats); + logger.info("Created downloads_stats table"); - logger.info("Dropping sarc_sushilogtmp_impala table"); - String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".sarc_sushilogtmp_impala"; - stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); - logger.info("Dropped sarc_sushilogtmp_impala table"); + logger.info("Dropping sarc_sushilogtmp_impala table"); + String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala"; + stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); + logger.info("Dropped sarc_sushilogtmp_impala table"); - logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); - String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala " + - "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(createSarcSushilogtmpImpala); - logger.info("Created sarc_sushilogtmp_impala"); + logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); + String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala " + + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(createSarcSushilogtmpImpala); + logger.info("Created sarc_sushilogtmp_impala"); - logger.info("Making sarc_sushilogtmp visible to impala"); - String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala;"; - stmtImpala.executeUpdate(invalidateMetadata); + logger.info("Making sarc_sushilogtmp visible to impala"); + String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala;"; + stmtImpala.executeUpdate(invalidateMetadata); - logger.info("Dropping downloads_stats_impala table"); - String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".downloads_stats_impala"; - stmtHive.executeUpdate(drop_downloads_stats_impala); - logger.info("Dropped downloads_stats_impala table"); + logger.info("Dropping downloads_stats_impala table"); + String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala"; + stmtHive.executeUpdate(drop_downloads_stats_impala); + logger.info("Dropped downloads_stats_impala table"); - logger.info("Making downloads_stats_impala deletion visible to impala"); - try { - String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala;"; - stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); - } catch (SQLException sqle) { - } + logger.info("Making downloads_stats_impala deletion visible to impala"); + try { + String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala;"; + stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); + } catch (SQLException sqle) { + } - // We run the following query in Impala because it is faster - logger.info("Creating downloads_stats_impala"); - String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala AS " + - "SELECT s.source, d.id AS repository_id, " + - "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " + - "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " + - ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + - ConnectDB.getStatsDBSchema() + ".datasource_results dr, " + - ConnectDB.getStatsDBSchema() + ".result_pids ro " + - "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND dr.id=d.id AND dr.result=ro.id AND " + - "s.rid=ro.pid AND ro.type='Digital Object Identifier' AND metric_type='ft_total' AND s.source='SARC-OJS'"; - stmtImpala.executeUpdate(createDownloadsStatsImpala); - logger.info("Creating downloads_stats_impala"); + // We run the following query in Impala because it is faster + logger.info("Creating downloads_stats_impala"); + String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala AS " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " + + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".datasource_results dr, " + + ConnectDB.getStatsDBSchema() + ".result_pids ro " + + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND dr.id=d.id AND dr.result=ro.id AND " + + "s.rid=ro.pid AND ro.type='Digital Object Identifier' AND metric_type='ft_total' AND s.source='SARC-OJS'"; + stmtImpala.executeUpdate(createDownloadsStatsImpala); + logger.info("Creating downloads_stats_impala"); - // Insert into downloads_stats - logger.info("Inserting data from downloads_stats_impala into downloads_stats"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats SELECT * " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; - stmtHive.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats"); + // Insert into downloads_stats + logger.info("Inserting data from downloads_stats_impala into downloads_stats"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats SELECT * " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; + stmtHive.executeUpdate(insertDStats); + logger.info("Inserted into downloads_stats"); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " + - "(`source` string, " + - "`repository_id` string, " + - "`rid` string, " + - "`date` string, " + - "`metric_type` string, " + - "`count` int)"; - stmtHive.executeUpdate(createSushilog); - logger.info("Created sushilog table"); + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository_id` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmtHive.executeUpdate(createSushilog); + logger.info("Created sushilog table"); - // Insert into sushilog - logger.info("Inserting into sushilog"); - String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(insertSushiLog); - logger.info("Inserted into sushilog"); + // Insert into sushilog + logger.info("Inserting into sushilog"); + String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(insertSushiLog); + logger.info("Inserted into sushilog"); - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - } + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + } - public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, - String url, String issn) throws Exception { - logger.info("Processing SARC! issn: " + issn + " with url: " + url); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, + String url, String issn) throws Exception { + logger.info("Processing SARC! issn: " + issn + " with url: " + url); + ConnectDB.getHiveConnection().setAutoCommit(false); - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, issn); - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - } - } - rs_date.close(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, issn); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - // Creating the needed configuration for the correct storing of data - Configuration config = new Configuration(); - config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); - config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); - config - .set( - "fs.hdfs.impl", - org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - config - .set( - "fs.file.impl", - org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem dfs = FileSystem.get(config); + // Creating the needed configuration for the correct storing of data + Configuration config = new Configuration(); + config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); + config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); + config + .set( + "fs.hdfs.impl", + org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + config + .set( + "fs.file.impl", + org.apache.hadoop.fs.LocalFileSystem.class.getName()); + FileSystem dfs = FileSystem.get(config); - while (start.before(end)) { - String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); - start.add(Calendar.MONTH, 1); + if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn); + } else { + + while (start.before(end)) { + String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); + start.add(Calendar.MONTH, 1); - logger.info("(getARReport) Getting report: " + reportUrl); - String text = getJson(reportUrl); - if (text == null) { - continue; - } + logger.info("(getARReport) Getting report: " + reportUrl); + String text = getJson(reportUrl); + if (text == null) { + continue; + } - JSONParser parser = new JSONParser(); - JSONObject jsonObject = null; - try { - jsonObject = (JSONObject) parser.parse(text); - } - // if there is a parsing error continue with the next url - catch (ParseException pe) { - continue; - } + JSONParser parser = new JSONParser(); + JSONObject jsonObject = null; + try { + jsonObject = (JSONObject) parser.parse(text); + } // if there is a parsing error continue with the next url + catch (ParseException pe) { + continue; + } - jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("sc:Report"); - if (jsonObject == null) { - continue; - } - jsonObject = (JSONObject) jsonObject.get("c:Report"); - jsonObject = (JSONObject) jsonObject.get("c:Customer"); - Object obj = jsonObject.get("c:ReportItems"); - JSONArray jsonArray = new JSONArray(); - if (obj instanceof JSONObject) { - jsonArray.add(obj); - } else { - jsonArray = (JSONArray) obj; - // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); - } - if (jsonArray == null) { - continue; - } + jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("sc:Report"); + if (jsonObject == null) { + continue; + } + jsonObject = (JSONObject) jsonObject.get("c:Report"); + jsonObject = (JSONObject) jsonObject.get("c:Customer"); + Object obj = jsonObject.get("c:ReportItems"); + JSONArray jsonArray = new JSONArray(); + if (obj instanceof JSONObject) { + jsonArray.add(obj); + } else { + jsonArray = (JSONArray) obj; + // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); + } + if (jsonArray == null) { + continue; + } - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" + - simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathArray); - FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathArray); + FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" + - simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathNonArray); - FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathNonArray); + FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); - for (Object aJsonArray : jsonArray) { + for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - renameKeysRecursively(":", jsonObjectRow); + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + renameKeysRecursively(":", jsonObjectRow); - if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { - finNonArray.write(jsonObjectRow.toJSONString().getBytes()); - finNonArray.writeChar('\n'); - } else { - finArray.write(jsonObjectRow.toJSONString().getBytes()); - finArray.writeChar('\n'); - } - } + if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { + finNonArray.write(jsonObjectRow.toJSONString().getBytes()); + finNonArray.writeChar('\n'); + } else { + finArray.write(jsonObjectRow.toJSONString().getBytes()); + finArray.writeChar('\n'); + } + } - finArray.close(); - finNonArray.close(); + finArray.close(); + finNonArray.close(); - // Check the file size and if it is too big, delete it - File fileArray = new File(filePathArray); + // Check the file size and if it is too big, delete it + File fileArray = new File(filePathArray); if (fileArray.length() == 0) - fileArray.delete(); - File fileNonArray = new File(filePathNonArray); + fileArray.delete(); + File fileNonArray = new File(filePathNonArray); if (fileNonArray.length() == 0) - fileNonArray.delete(); + fileNonArray.delete(); - } + } - dfs.close(); + dfs.close(); + } + //ConnectDB.getHiveConnection().close(); + } - ConnectDB.getHiveConnection().close(); - } - - private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { - for (Object jjval : givenJsonObj) { - if (jjval instanceof JSONArray) - renameKeysRecursively(delimiter, (JSONArray) jjval); - else if (jjval instanceof JSONObject) - renameKeysRecursively(delimiter, (JSONObject) jjval); - // All other types of vals - else + private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { + for (Object jjval : givenJsonObj) { + if (jjval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jjval); + } else if (jjval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jjval); + } // All other types of vals + else ; - } - } + } + } - private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { - Set jkeys = new HashSet(givenJsonObj.keySet()); - for (String jkey : jkeys) { + private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { + Set jkeys = new HashSet(givenJsonObj.keySet()); + for (String jkey : jkeys) { - String[] splitArray = jkey.split(delimiter); - String newJkey = splitArray[splitArray.length - 1]; + String[] splitArray = jkey.split(delimiter); + String newJkey = splitArray[splitArray.length - 1]; - Object jval = givenJsonObj.get(jkey); - givenJsonObj.remove(jkey); - givenJsonObj.put(newJkey, jval); + Object jval = givenJsonObj.get(jkey); + givenJsonObj.remove(jkey); + givenJsonObj.put(newJkey, jval); - if (jval instanceof JSONObject) - renameKeysRecursively(delimiter, (JSONObject) jval); + if (jval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jval); + } - if (jval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jval); - } - } - } + if (jval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jval); + } + } + } - private String getJson(String url) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { + private String getJson(String url) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { - // Logging error and silently continuing - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); + // Logging error and silently continuing + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); // return null; // throw new Exception("Failed to get URL: " + e.toString(), e); - } - return ""; - } + } + return ""; + } } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java index 405b58bd5..ae901dfa5 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java @@ -173,7 +173,7 @@ public class UsageStatsExporter { sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); - stmt.close(); + stmt.close(); ConnectDB.getHiveConnection().close(); } } From 3e24c9b176736bd1e6c8bbc4314750dd6863ce0f Mon Sep 17 00:00:00 2001 From: Dimitris Date: Sat, 14 Nov 2020 18:42:07 +0200 Subject: [PATCH 03/16] Changes 14112020 --- .../dhp-usage-datasets-stats-update/pom.xml | 79 ++ .../datasetsusagestats/export/ConnectDB.java | 202 +++ .../DownloadReportsListFromDatacite.java | 97 ++ .../export/ExecuteWorkflow.java | 70 + .../export/UsageStatsExporter.java | 236 +++ .../datasets_usagestats_parameters.json | 56 + .../oozie_app/config-default.xml | 38 + .../datasetsusagestats/oozie_app/workflow.xml | 70 + .../dhp-usage-raw-data-update/pom.xml | 79 ++ .../graph/usagerawdata/export/ConnectDB.java | 125 ++ .../usagerawdata/export/ExecuteWorkflow.java | 202 +++ .../graph/usagerawdata/export/IrusStats.java | 419 ++++++ .../export/LaReferenciaDownloadLogs.java | 265 ++++ .../export/LaReferenciaStats.java | 436 ++++++ .../export/PiwikDownloadLogs.java | 327 +++++ .../usagerawdata/export/PiwikStatsDB.java | 1262 +++++++++++++++++ .../export/ReadCounterRobotsList.java | 54 + .../graph/usagerawdata/export/SarcStats.java | 575 ++++++++ .../export/UsageStatsExporter.java | 186 +++ .../export/usagerawdata_parameters.json | 231 +++ .../usagerawdata/oozie_app/config-default.xml | 38 + .../graph/usagerawdata/oozie_app/workflow.xml | 90 ++ .../oa/graph/usagestats/export/IrusStats.java | 2 +- .../usagestats/export/LaReferenciaStats.java | 13 + .../usagestats/export/PiwikDownloadLogs.java | 470 +++--- .../graph/usagestats/export/PiwikStatsDB.java | 30 +- .../oa/graph/usagestats/export/SarcStats.java | 17 +- .../usagestats/export/UsageStatsExporter.java | 2 +- 28 files changed, 5420 insertions(+), 251 deletions(-) create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/pom.xml create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-usage-raw-data-update/pom.xml create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml new file mode 100644 index 000000000..14b543a57 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml @@ -0,0 +1,79 @@ + + + + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + + 4.0.0 + dhp-usage-datasets-stats-update + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + dhp-usage-datasets-stats-update + diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java new file mode 100644 index 000000000..e6da7eff3 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java @@ -0,0 +1,202 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.Properties; + +import org.apache.log4j.Logger; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +/** + * @author D. Pierrakos, S. Zoupanos + */ +import com.mchange.v2.c3p0.ComboPooledDataSource; + +public abstract class ConnectDB { + + public static Connection DB_HIVE_CONNECTION; + public static Connection DB_IMPALA_CONNECTION; + + private static String dbHiveUrl; + private static String dbImpalaUrl; + private static String datasetUsageStatsDBSchema; + private static String statsDBSchema; + private final static Logger logger = Logger.getLogger(ConnectDB.class); + private Statement stmt = null; + + static void init() throws ClassNotFoundException { + + dbHiveUrl = ExecuteWorkflow.dbHiveUrl; + dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; + datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema; + statsDBSchema = ExecuteWorkflow.statsDBSchema; + + Class.forName("org.apache.hive.jdbc.HiveDriver"); + } + + public static Connection getHiveConnection() throws SQLException { + if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { + return DB_HIVE_CONNECTION; + } else { + DB_HIVE_CONNECTION = connectHive(); + + return DB_HIVE_CONNECTION; + } + } + + public static Connection getImpalaConnection() throws SQLException { + if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { + return DB_IMPALA_CONNECTION; + } else { + DB_IMPALA_CONNECTION = connectImpala(); + + return DB_IMPALA_CONNECTION; + } + } + + public static String getDataSetUsageStatsDBSchema() { + return ConnectDB.datasetUsageStatsDBSchema; + } + + public static String getStatsDBSchema() { + return ConnectDB.statsDBSchema; + } + + private static Connection connectHive() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbHiveUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + + logger.info("Opened database successfully"); + + return cpds.getConnection(); + + } + + private static Connection connectImpala() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbImpalaUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + + logger.info("Opened database successfully"); + return cpds.getConnection(); + + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping logs DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + // Create Piwiklog table - This table should exist + String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) " + + "into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePiwikLog); + + ///////////////////////////////////////// + // Rule for duplicate inserts @ piwiklog + ///////////////////////////////////////// + + String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePortalLog); + + ////////////////////////////////////////////////// + // Rule for duplicate inserts @ process_portal_log + ////////////////////////////////////////////////// + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } +} +/* +CREATE TABLE IF NOT EXISTS dataciteReports (reportid STRING, + name STRING, + source STRING, + release STRING, + createdby STRING, + report_end_date STRING, + report_start_date STRING) + CLUSTERED BY (reportid) + into 100 buckets stored as orc tblproperties('transactional'='true'); +*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java new file mode 100644 index 000000000..196238ea2 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java @@ -0,0 +1,97 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import com.google.gson.JsonObject; +import java.util.ArrayList; +import java.util.Iterator; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * + * @author dpie + */ +public class DownloadReportsListFromDatacite { + + private String dataciteBaseURL; + private String dataciteReportPath; + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) throws MalformedURLException, Exception { + + this.dataciteBaseURL = dataciteBaseURL; + this.dataciteReportPath = dataciteReportPath; + } + + public void downloadReportsList() throws ParseException { + StringBuilder responseStrBuilder = new StringBuilder(); + + Gson gson = new Gson(); + + try { + BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream()); + logger.info("Downloading from " + dataciteBaseURL); + + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String inputStr; + + while ((inputStr = streamReader.readLine()) != null) { + responseStrBuilder.append(inputStr); + } + } catch (IOException e) { + logger.info(e.getMessage()); + } + JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class); + JsonArray dataArray = jsonObject.getAsJsonArray("reports"); + ArrayList reportsList = new ArrayList(); + for (JsonElement element : dataArray) { + reportsList.add(element.getAsJsonObject().get("id").getAsString()); + } + + Iterator it = reportsList.iterator(); + while (it.hasNext()) { + String reportId = it.next().toString(); + String url = dataciteBaseURL + reportId; + + try { + BufferedInputStream in = new BufferedInputStream(new URL(url).openStream()); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String inputStr; + StringBuilder responseStrBuilder2 = new StringBuilder(); + while ((inputStr = streamReader.readLine()) != null) { + responseStrBuilder2.append(inputStr); + } + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs.create(new Path(dataciteReportPath + "/" + reportId + ".json"), + true); + byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + fin.close(); + + fin.close(); + } catch (IOException e) { + System.out.println(e); + } + } + } +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java new file mode 100644 index 000000000..7b3db3115 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java @@ -0,0 +1,70 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.BasicConfigurator; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class ExecuteWorkflow { + + static String dataciteBaseURL; + static String dataciteReportPath; + static String dbHiveUrl; + static String dbImpalaUrl; + static String datasetUsageStatsDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; + static boolean datasetsEmptyDirs; + static boolean finalTablesVisibleToImpala; + + + public static void main(String args[]) throws Exception { + + // Sending the logs to the console + BasicConfigurator.configure(); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + UsageStatsExporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json"))); + parser.parseArgument(args); + + // Setting up the initial parameters + dataciteBaseURL = parser.get("dataciteBaseURL"); + dataciteReportPath = parser.get("dataciteReportPath"); + dbHiveUrl = parser.get("dbHiveUrl"); + dbImpalaUrl = parser.get("dbImpalaUrl"); + datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema"); + statsDBSchema = parser.get("statsDBSchema"); + + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) + recreateDbAndTables = true; + else + recreateDbAndTables = false; + + if (parser.get("datasetsEmptyDirs").toLowerCase().equals("true")) + datasetsEmptyDirs = true; + else + datasetsEmptyDirs = false; + + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) + finalTablesVisibleToImpala = true; + else + finalTablesVisibleToImpala = false; + + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); + usagestatsExport.export(); + } + +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java new file mode 100644 index 000000000..28c4f30a1 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java @@ -0,0 +1,236 @@ +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.io.IOException; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Main class for downloading and processing Usage statistics + * + * @author D. Pierrakos, S. Zoupanos + */ +public class UsageStatsExporter { + + private Statement stmt = null; + + public UsageStatsExporter() { + + } + + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + private void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath); + dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true); + + logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath)); + + } + + public void export() throws Exception { + + logger.info("Initialising DB properties"); + ConnectDB.init(); + ConnectDB.getHiveConnection(); + + if (ExecuteWorkflow.recreateDbAndTables) { + createDatabase(); + createTables(); + reCreateLogDirs(); + } + logger.info("Initializing the download logs module"); + DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, ExecuteWorkflow.dataciteReportPath); + + if (ExecuteWorkflow.datasetsEmptyDirs) { + logger.info("Downloading Reports List From Datacite"); + drfd.downloadReportsList(); + logger.info("Reports List has been downloaded"); + } + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + // Create Reports table - This table should exist + String sqlCreateTableDataciteeReports = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports(reportid STRING, \n" + + " name STRING, \n" + + " source STRING,\n" + + " release STRING,\n" + + " createdby STRING,\n" + + " report_end_date STRING,\n" + + " report_start_date STRING)\n" + + " CLUSTERED BY (reportid)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + + stmt.executeUpdate(sqlCreateTableDataciteeReports); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + +// runImpalaQuery(); +/* + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + + logger.info("Re-creating database and tables"); + + logger.info("Initializing the download logs module"); + PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); + + if (ExecuteWorkflow.piwikEmptyDirs) { + logger.info("Recreating Piwik log directories"); + piwikstatsdb.reCreateLogDirs(); + } + + // Downloading piwik logs (also managing directory creation) + if (ExecuteWorkflow.downloadPiwikLogs) { + logger.info("Downloading piwik logs"); + piwd + .GetOpenAIRELogs( + ExecuteWorkflow.repoLogPath, + ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); + } + logger.info("Downloaded piwik logs"); + + // Create DB tables, insert/update statistics + String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; + piwikstatsdb.setCounterRobotsURL(cRobotsUrl); + + if (ExecuteWorkflow.processPiwikLogs) { + logger.info("Processing logs"); + piwikstatsdb.processLogs(); + } + + logger.info("Creating LaReferencia tables"); + LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, + ExecuteWorkflow.lareferenciaAuthToken); + + if (ExecuteWorkflow.laReferenciaEmptyDirs) { + logger.info("Recreating LaReferencia log directories"); + lrf.reCreateLogDirs(); + } + + if (ExecuteWorkflow.downloadLaReferenciaLogs) { + logger.info("Downloading LaReferencia logs"); + lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); + logger.info("Downloaded LaReferencia logs"); + } + LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); + + if (ExecuteWorkflow.processLaReferenciaLogs) { + logger.info("Processing LaReferencia logs"); + lastats.processLogs(); + logger.info("LaReferencia logs done"); + } + + IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); + if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { + logger.info("Creating Irus Stats tables"); + irusstats.createTables(); + logger.info("Created Irus Stats tables"); + + logger.info("Re-create log dirs"); + irusstats.reCreateLogDirs(); + logger.info("Re-created log dirs"); + } + + if (ExecuteWorkflow.irusDownloadReports) { + irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); + } + if (ExecuteWorkflow.irusProcessStats) { + irusstats.processIrusStats(); + logger.info("Irus done"); + } + + SarcStats sarcStats = new SarcStats(); + if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { + sarcStats.reCreateLogDirs(); + } + if (ExecuteWorkflow.sarcDownloadReports) { + sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + } + if (ExecuteWorkflow.sarcProcessStats) { + sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + sarcStats.finalizeSarcStats(); + } + logger.info("Sarc done"); + + // finalize usagestats + if (ExecuteWorkflow.finalizeStats) { + piwikstatsdb.finalizeStats(); + logger.info("Finalized stats"); + } + + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } + + logger.info("End"); + */ +} +/* + private void invalidateMetadata() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getImpalaConnection().createStatement(); + + String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } + */ diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json new file mode 100644 index 000000000..f8d51a882 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json @@ -0,0 +1,56 @@ +[ + { + "paramName": "dbu", + "paramLongName": "dataciteBaseURL", + "paramDescription": "URL of Datacite Reports Endpoint", + "paramRequired": true + }, + { + "paramName": "drp", + "paramLongName": "dataciteReportPath", + "paramDescription": "Path for Datacite Reports", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dusdbs", + "paramLongName": "datasetUsageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "datasetsEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the dataset_usage_stats, visible to Impala", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml new file mode 100644 index 000000000..b5c807378 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml new file mode 100644 index 000000000..3a81e497d --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml @@ -0,0 +1,70 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + eu.dnetlib.oa.graph.datasetsusagestats.export.ExecuteWorkflow + --dataciteBaseURL + ${dataciteBaseURL} + --dataciteReportPath + ${dataciteReportPath} + --dbHiveUrl + ${hiveJdbcUrl} + --dbImpalaUrl + ${impalaJdbcUrl} + --datasetUsageStatsDBSchema + ${datasetUsageStatsDBSchema} + --statsDBSchema + ${statsDBSchema} + --recreateDbAndTables + ${recreateDbAndTables} + --datasetsEmptyDirs + ${datasetsEmptyDirs} + --finalTablesVisibleToImpala + ${finalTablesVisibleToImpala} + + + + + + + + diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml new file mode 100644 index 000000000..338b2a2c5 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -0,0 +1,79 @@ + + + + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + + 4.0.0 + dhp-usage-raw-data-update + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + dhp-usage-raw-data-update + diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java new file mode 100644 index 000000000..f76644c83 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java @@ -0,0 +1,125 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.Properties; + +import org.apache.log4j.Logger; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +/** + * @author D. Pierrakos, S. Zoupanos + */ +import com.mchange.v2.c3p0.ComboPooledDataSource; + +public abstract class ConnectDB { + + public static Connection DB_HIVE_CONNECTION; + public static Connection DB_IMPALA_CONNECTION; + + private static String dbHiveUrl; + private static String dbImpalaUrl; + private static String usageStatsDBSchema; + private static String statsDBSchema; + private final static Logger log = Logger.getLogger(ConnectDB.class); + + static void init() throws ClassNotFoundException { + + dbHiveUrl = ExecuteWorkflow.dbHiveUrl; + dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; + usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; + statsDBSchema = ExecuteWorkflow.statsDBSchema; + + Class.forName("org.apache.hive.jdbc.HiveDriver"); + } + + public static Connection getHiveConnection() throws SQLException { + if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { + return DB_HIVE_CONNECTION; + } else { + DB_HIVE_CONNECTION = connectHive(); + + return DB_HIVE_CONNECTION; + } + } + + public static Connection getImpalaConnection() throws SQLException { + if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { + return DB_IMPALA_CONNECTION; + } else { + DB_IMPALA_CONNECTION = connectImpala(); + + return DB_IMPALA_CONNECTION; + } + } + + public static String getUsageStatsDBSchema() { + return ConnectDB.usageStatsDBSchema; + } + + public static String getStatsDBSchema() { + return ConnectDB.statsDBSchema; + } + + private static Connection connectHive() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbHiveUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + return cpds.getConnection(); + + } + + private static Connection connectImpala() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbImpalaUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + + return cpds.getConnection(); + + } + +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java new file mode 100644 index 000000000..774dcf0b7 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java @@ -0,0 +1,202 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.BasicConfigurator; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class ExecuteWorkflow { + + static String matomoAuthToken; + static String matomoBaseURL; + static String repoLogPath; + static String portalLogPath; + static String portalMatomoID; + static String irusUKBaseURL; + static String irusUKReportPath; + static String sarcsReportPathArray; + static String sarcsReportPathNonArray; + static String lareferenciaLogPath; + static String lareferenciaBaseURL; + static String lareferenciaAuthToken; + static String dbHiveUrl; + static String dbImpalaUrl; + static String usageStatsDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; + + static boolean piwikEmptyDirs; + static boolean downloadPiwikLogs; + static boolean processPiwikLogs; + + static Calendar startingLogPeriod; + static Calendar endingLogPeriod; + static int numberOfPiwikIdsToDownload; + static int numberOfSiteIdsToDownload; + + static boolean laReferenciaEmptyDirs; + static boolean downloadLaReferenciaLogs; + static boolean processLaReferenciaLogs; + + static boolean irusCreateTablesEmptyDirs; + static boolean irusDownloadReports; + static boolean irusProcessStats; + static int irusNumberOfOpendoarsToDownload; + + static boolean sarcCreateTablesEmptyDirs; + static boolean sarcDownloadReports; + static boolean sarcProcessStats; + static int sarcNumberOfIssnToDownload; + + static boolean finalizeStats; + static boolean finalTablesVisibleToImpala; + + static int numberOfDownloadThreads; + + public static void main(String args[]) throws Exception { + + // Sending the logs to the console + BasicConfigurator.configure(); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + UsageStatsExporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json"))); + parser.parseArgument(args); + + // Setting up the initial parameters + matomoAuthToken = parser.get("matomoAuthToken"); + matomoBaseURL = parser.get("matomoBaseURL"); + repoLogPath = parser.get("repoLogPath"); + portalLogPath = parser.get("portalLogPath"); + portalMatomoID = parser.get("portalMatomoID"); + irusUKBaseURL = parser.get("irusUKBaseURL"); + irusUKReportPath = parser.get("irusUKReportPath"); + sarcsReportPathArray = parser.get("sarcsReportPathArray"); + sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); + lareferenciaLogPath = parser.get("lareferenciaLogPath"); + lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); + lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); + + dbHiveUrl = parser.get("dbHiveUrl"); + dbImpalaUrl = parser.get("dbImpalaUrl"); + usageStatsDBSchema = parser.get("usageStatsDBSchema"); + statsDBSchema = parser.get("statsDBSchema"); + + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) + recreateDbAndTables = true; + else + recreateDbAndTables = false; + + if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) + piwikEmptyDirs = true; + else + piwikEmptyDirs = false; + + if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) + downloadPiwikLogs = true; + else + downloadPiwikLogs = false; +/* + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) + processPiwikLogs = true; + else + processPiwikLogs = false; +*/ + String startingLogPeriodStr = parser.get("startingLogPeriod"); + Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); + startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); + + String endingLogPeriodStr = parser.get("endingLogPeriod"); + Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); + endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); + + numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); + numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); + + if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) + laReferenciaEmptyDirs = true; + else + laReferenciaEmptyDirs = false; + + if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) + downloadLaReferenciaLogs = true; + else + downloadLaReferenciaLogs = false; +/* + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) + processLaReferenciaLogs = true; + else + processLaReferenciaLogs = false; +*/ + if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) + irusCreateTablesEmptyDirs = true; + else + irusCreateTablesEmptyDirs = false; + + if (parser.get("irusDownloadReports").toLowerCase().equals("true")) + irusDownloadReports = true; + else + irusDownloadReports = false; +/* + if (parser.get("irusProcessStats").toLowerCase().equals("true")) + irusProcessStats = true; + else + irusProcessStats = false; + irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); +*/ + if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) + sarcCreateTablesEmptyDirs = true; + else + sarcCreateTablesEmptyDirs = false; + + if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) + sarcDownloadReports = true; + else + sarcDownloadReports = false; +/* + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) + sarcProcessStats = true; + else + sarcProcessStats = false; + sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); +*/ +/* + if (parser.get("finalizeStats").toLowerCase().equals("true")) + finalizeStats = true; + else + finalizeStats = false; + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) + finalTablesVisibleToImpala = true; + else +*/ finalTablesVisibleToImpala = false; + + numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); + + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); + usagestatsExport.export(); + } + + private static Calendar startingLogPeriodStr(Date date) { + + Calendar calendar = Calendar.getInstance(); + calendar.setTime(date); + return calendar; + + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java new file mode 100644 index 000000000..090f76ff5 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java @@ -0,0 +1,419 @@ +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class IrusStats { + + private String irusUKURL; + + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + + public IrusStats(String irusUKURL) throws Exception { + this.irusUKURL = irusUKURL; + // The following may not be needed - It will be created when JSON tables are created +// createTmpTables(); + } + + public void reCreateLogDirs() throws Exception { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); + + logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); + } + + public void createTables() throws Exception { + try { + logger.info("Creating sushilog"); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); + + // To see how to apply to the ignore duplicate rules and indexes +// stmt.executeUpdate(sqlCreateTableSushiLog); +// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO sushilog " +// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," +// + "sushilog.rid, sushilog.date " +// + "FROM sushilog " +// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlcreateRuleSushiLog); +// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; +// stmt.executeUpdate(createSushiIndex); + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + +// // The following may not be needed - It will be created when JSON tables are created +// private void createTmpTables() throws Exception { +// try { +// +// Statement stmt = ConnectDB.getConnection().createStatement(); +// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; +// stmt.executeUpdate(sqlCreateTableSushiLog); +// +// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;"); +// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;"; +// // stmt.executeUpdate(sqlCopyPublicSushiLog); +// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO sushilogtmp " +// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository," +// + "sushilogtmp.rid, sushilogtmp.date " +// + "FROM sushilogtmp " +// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlcreateRuleSushiLog); +// +// stmt.close(); +// ConnectDB.getConnection().close(); +// log.info("Sushi Tmp Tables Created"); +// } catch (Exception e) { +// log.error("Failed to create tables: " + e); +// throw new Exception("Failed to create tables: " + e.toString(), e); +// } +// } + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping sushilogtmp_json table"); + String dropSushilogtmpJson = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sushilogtmp_json"; + stmt.executeUpdate(dropSushilogtmpJson); + logger.info("Dropped sushilogtmp_json table"); + + logger.info("Creating irus_sushilogtmp_json table"); + String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " Type: STRING,\n" + + " Value: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` ARRAY<\n" + + " struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(createSushilogtmpJson); + logger.info("Created irus_sushilogtmp_json table"); + + logger.info("Dropping irus_sushilogtmp table"); + String dropSushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(dropSushilogtmp); + logger.info("Dropped irus_sushilogtmp table"); + + logger.info("Creating irus_sushilogtmp table"); + String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(createSushilogtmp); + logger.info("Created irus_sushilogtmp table"); + + logger.info("Inserting to irus_sushilogtmp table"); + String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " + + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " + + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + + "WHERE `ItemIdent`.`Type`= 'OAI'"; + stmt.executeUpdate(insertSushilogtmp); + logger.info("Inserted to irus_sushilogtmp table"); + + logger.info("Creating downloads_stats table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created downloads_stats table"); + + logger.info("Inserting into downloads_stats"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; + stmt.executeUpdate(insertDStats); + logger.info("Inserted into downloads_stats"); + + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository_id` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); + + logger.info("Inserting to sushilog table"); + String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(insertToShushilog); + logger.info("Inserted to sushilog table"); + + ConnectDB.getHiveConnection().close(); + } + + public void getIrusRRReport(String irusUKReportPath) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); + + String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + + logger.info("(getIrusRRReport) Getting report: " + reportUrl); + + String text = getJson(reportUrl, "", ""); + + List opendoarsToVisit = new ArrayList(); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + int i = 0; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject opendoar = (JSONObject) identifier; + if (opendoar.get("Type").toString().equals("OpenDOAR")) { + i++; + opendoarsToVisit.add(opendoar.get("Value").toString()); + break; + } + } + // break; + } + + logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); + + if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 + && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + } + + logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); + + for (String opendoar : opendoarsToVisit) { + logger.info("Now working on openDoar: " + opendoar); + this.getIrusIRReport(opendoar, irusUKReportPath); + } + + logger.info("(getIrusRRReport) Finished with report: " + reportUrl); + } + + private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { + + logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); + + ConnectDB.getHiveConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, "opendoar____::" + opendoar); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + int batch_size = 0; + + if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); + } else { + while (start.before(end)) { + logger.info("date: " + simpleDateFormat.format(start.getTime())); + String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) + + "&RepositoryIdentifier=opendoar%3A" + opendoar + + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + start.add(Calendar.MONTH, 1); + + logger.info("Downloading file: " + reportUrl); + String text = getJson(reportUrl, "", ""); + if (text == null) { + continue; + } + + FileSystem fs = FileSystem.get(new Configuration()); + String filePath = irusUKReportPath + "/" + "IrusIRReport_" + + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePath); + FSDataOutputStream fin = fs.create(new Path(filePath), true); + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray == null) { + continue; + } + String oai = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + fin.write(jsonObjectRow.toJSONString().getBytes()); + fin.writeChar('\n'); + } + + fin.close(); + } + + } + //ConnectDB.getHiveConnection().close(); + + logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); + } + + private String getJson(String url) throws Exception { + try { + System.out.println("===> Connecting to: " + url); + URL website = new URL(url); + System.out.println("Connection url -----> " + url); + URLConnection connection = website.openConnection(); + + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); +// response.append("\n"); + } + } + + System.out.println("response ====> " + response.toString()); + + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } + + private String getJson(String url, String username, String password) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL", e); + return null; + } + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java new file mode 100644 index 000000000..88550579b --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java @@ -0,0 +1,265 @@ +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class LaReferenciaDownloadLogs { + + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; + + /* + * The Piwik's API method + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; + + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); + + public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + this.createTables(); +// this.createTmpTables(); + } + + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } + + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); +// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialog " +// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," +// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " +// + "FROM lareferencialog " +// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; +// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); +// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } + +// private void createTmpTables() throws Exception { +// +// try { +// Statement stmt = ConnectDB.getConnection().createStatement(); +// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; +// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialogtmp " +// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," +// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " +// + "FROM lareferencialogtmp " +// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); +// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); +// +// stmt.close(); +// log.info("Lareferencia Tmp Tables Created"); +// +// } catch (Exception e) { +// log.error("Failed to create tmptables: " + e); +// throw new Exception("Failed to create tmp tables: " + e.toString(), e); +// // System.exit(0); +// } +// } + private String getPiwikLogUrl() { + return piwikUrl + "/"; + } + + private String getJson(String url) throws Exception { + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); +// response.append("\n"); + } + } + + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } + + public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { + + String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; + String content = ""; + + List siteIdsToVisit = new ArrayList(); + + // Getting all the siteIds in a list for logging reasons & limiting the list + // to the max number of siteIds + content = getJson(baseApiUrl); + JSONParser parser = new JSONParser(); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); + } + logger.info("Found the following siteIds for download: " + siteIdsToVisit); + + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } + + logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); + + for (int siteId : siteIdsToVisit) { + logger.info("Now working on LaReferencia MatomoId: " + siteId); + this.GetLaReFerenciaLogs(repoLogsPath, siteId); + } + } + + public void GetLaReFerenciaLogs(String repoLogsPath, + int laReferencialMatomoID) throws Exception { + + logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); + + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialog WHERE matomoid=?"); + st.setInt(1, laReferencialMatomoID); + Date dateMax = null; + + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + Date date = currDay.getTime(); + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID); + } else { + logger + .info( + "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " + + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + outFolder = repoLogsPath; + + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), + true); + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + int i = 0; + + JSONParser parser = new JSONParser(); + do { + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } + + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + fin.write(jsonObjectRaw.toJSONString().getBytes()); + fin.writeChar('\n'); + } + + logger + .info( + "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID + + " and for " + + sdf.format(date)); + i++; + } while (true); + fin.close(); + } + } + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java new file mode 100644 index 000000000..ef7636099 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java @@ -0,0 +1,436 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URLDecoder; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.sql.Statement; +import java.sql.Timestamp; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class LaReferenciaStats { + + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class); + + private String logRepoPath; + + private Statement stmt = null; + + private String CounterRobotsURL; + private ArrayList robotsList; + + public LaReferenciaStats(String logRepoPath) throws Exception { + this.logRepoPath = logRepoPath; + this.createTables(); +// this.createTmpTables(); + } + + /* + * private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) { + * log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } } + */ + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); +// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialog " +// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," +// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " +// + "FROM lareferencialog " +// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; +// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); +// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } + +// private void createTmpTables() throws Exception { +// +// try { +// Statement stmt = ConnectDB.getConnection().createStatement(); +// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; +// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialogtmp " +// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," +// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " +// + "FROM lareferencialogtmp " +// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); +// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); +// +// stmt.close(); +// log.info("Lareferencia Tmp Tables Created"); +// +// } catch (Exception e) { +// log.error("Failed to create tmptables: " + e); +// throw new Exception("Failed to create tmp tables: " + e.toString(), e); +// // System.exit(0); +// } +// } + + public void processLogs() throws Exception { + try { + logger.info("Processing LaReferencia repository logs"); + processlaReferenciaLog(); + logger.info("LaReferencia repository logs process done"); + + logger.info("LaReferencia removing double clicks"); + removeDoubleClicks(); + logger.info("LaReferencia removed double clicks"); + + logger.info("LaReferencia creating viewsStats"); + viewsStats(); + logger.info("LaReferencia created viewsStats"); + logger.info("LaReferencia creating downloadsStats"); + downloadsStats(); + logger.info("LaReferencia created downloadsStats"); + logger.info("LaReferencia updating Production Tables"); + updateProdTables(); + logger.info("LaReferencia updated Production Tables"); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + public void processlaReferenciaLog() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping lareferencialogtmp_json table"); + String drop_lareferencialogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp_json"; + stmt.executeUpdate(drop_lareferencialogtmp_json); + logger.info("Dropped lareferencialogtmp_json table"); + + logger.info("Creating lareferencialogtmp_json"); + String create_lareferencialogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `repItem` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " timestamp: STRING,\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " `customVariables`: struct<\n" + + " `1`: struct<\n" + + " `customVariablePageValue1`: STRING\n" + + " >,\n" + + " `2`: struct<\n" + + " `customVariablePageValue2`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + " >" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.lareferenciaLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_lareferencialogtmp_json); + logger.info("Created lareferencialogtmp_json"); + + logger.info("Dropping lareferencialogtmp table"); + String drop_lareferencialogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp"; + stmt.executeUpdate(drop_lareferencialogtmp); + logger.info("Dropped lareferencialogtmp table"); + + logger.info("Creating lareferencialogtmp"); + String create_lareferencialogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_lareferencialogtmp); + logger.info("Created lareferencialogtmp"); + + logger.info("Inserting into lareferencialogtmp"); + String insert_lareferencialogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp " + + "SELECT DISTINCT cast(idSite as INT) as matomoid, CONCAT('opendoar____::', " + + "actiondetail.customVariables.`2`.customVariablePageValue2) as source, idVisit as id_Visit, country, " + + "actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_lareferencialogtmp); + logger.info("Inserted into lareferencialogtmp"); + + stmt.close(); + } + + public void removeDoubleClicks() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning download double clicks"); + // clean download double clicks + String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " + + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " + + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " + + "AND p1.timestamp listHdfsDir(String dir) throws Exception { + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + // log.info("Found hdfs file " + fileName); + fileNames.add(fileName); + } + // hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logRepoPath)); + throw new Exception("HDFS file path with exported data does not exist : " + logRepoPath, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + if (!line.equals("[]")) { + sb.append(line); + } + // sb.append(line); + line = br.readLine(); + } + result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + if (result.equals("")) { + result = "[]"; + } + + // fs.close(); + } catch (Exception e) { + logger.error(e.getMessage()); + throw new Exception(e); + } + + return result; + } + +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java new file mode 100644 index 000000000..681105de4 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java @@ -0,0 +1,327 @@ +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.Authenticator; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class PiwikDownloadLogs { + + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; + + /* + * The Piwik's API method + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + + private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); + + public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + + } + + private String getPiwikLogUrl() { + return "https://" + piwikUrl + "/"; + } + + private String getJson(String url) throws Exception { + try { + logger.debug("Connecting to download the JSON: " + url); + URL website = new URL(url); + URLConnection connection = website.openConnection(); + + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + url + " Exception: " + e); + throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); + } + } + + class WorkerThread implements Runnable { + + private Calendar currDay; + private int siteId; + private String repoLogsPath; + private String portalLogPath; + private String portalMatomoID; + + public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws IOException { + this.currDay = (Calendar) currDay.clone(); + this.siteId = new Integer(siteId); + this.repoLogsPath = new String(repoLogsPath); + this.portalLogPath = new String(portalLogPath); + this.portalMatomoID = new String(portalMatomoID); + } + + public void run() { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + System.out + .println( + Thread.currentThread().getName() + " (Start) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + try { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + System.out + .println( + Thread.currentThread().getName() + " (End) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + } + + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + + int i = 0; + + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); + + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } + + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + writtenBytes += jsonObjectRawBytes.length + 1; + } + + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); + + i++; + } while (true); + + fs.close(); + } + } + + public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { + + Statement statement = ConnectDB.getHiveConnection().createStatement(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + ResultSet rs = statement + .executeQuery( + "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() + + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + + // Getting all the piwikids in a list for logging reasons & limitting the list + // to the max number of piwikids + List piwikIdToVisit = new ArrayList(); + //while (rs.next()) + //piwikIdToVisit.add(rs.getInt(1)); + piwikIdToVisit.add(13); + + logger.info("Found the following piwikIds for download: " + piwikIdToVisit); + + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { + logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } + + logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); + + // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); + for (int siteId : piwikIdToVisit) { + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); + + logger.info("Now working on piwikId: " + siteId); + + PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog WHERE source=?"); + st.setInt(1, siteId); + Date dateMax = null; + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); + + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + // logger.info("Date used " + currDay.toString()); + // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + // executor.execute(worker);// calling execute method of ExecutorService + logger.info("Date used " + currDay.getTime().toString()); + + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); + } else { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + } + + } + } + // executor.shutdown(); + // while (!executor.isTerminated()) { + // } + // System.out.println("Finished all threads"); + } + + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + + int i = 0; + + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); + + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } + + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + writtenBytes += jsonObjectRawBytes.length + 1; + } + + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); + + i++; + } while (true); + + fs.close(); + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java new file mode 100644 index 000000000..e0225d49a --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java @@ -0,0 +1,1262 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URLDecoder; +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class PiwikStatsDB { + + private String logPath; + private String logRepoPath; + private String logPortalPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); + + private String CounterRobotsURL; + private ArrayList robotsList; + + public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception { + this.logRepoPath = logRepoPath; + this.logPortalPath = logPortalPath; + + } + + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); + + logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); + + logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); + + logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); + } + + public void recreateDBAndTables() throws Exception { + this.createDatabase(); + this.createTables(); + // The piwiklog table is not needed since it is built + // on top of JSON files + this.createTmpTables(); + } + + public ArrayList getRobotsList() { + return robotsList; + } + + public void setRobotsList(ArrayList robotsList) { + this.robotsList = robotsList; + } + + public String getCounterRobotsURL() { + return CounterRobotsURL; + } + + public void setCounterRobotsURL(String CounterRobotsURL) { + this.CounterRobotsURL = CounterRobotsURL; + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + // Create Piwiklog table - This table should exist + String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) " + + "into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePiwikLog); + + ///////////////////////////////////////// + // Rule for duplicate inserts @ piwiklog + ///////////////////////////////////////// + + String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePortalLog); + + ////////////////////////////////////////////////// + // Rule for duplicate inserts @ process_portal_log + ////////////////////////////////////////////////// + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + private void createTmpTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTmpTablePiwikLog); + + ////////////////////////////////////////////////// + // Rule for duplicate inserts @ piwiklogtmp + ////////////////////////////////////////////////// + + ////////////////////////////////////////////////// + // Copy from public.piwiklog to piwiklog + ////////////////////////////////////////////////// + // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;"; + // stmt.executeUpdate(sqlCopyPublicPiwiklog); + + String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTmpTablePortalLog); + + ////////////////////////////////////////////////// + // Rule for duplicate inserts @ process_portal_log_tmp + ////////////////////////////////////////////////// + + stmt.close(); + + } catch (Exception e) { + logger.error("Failed to create tmptables: " + e); + throw new Exception("Failed to create tmp tables: " + e.toString(), e); + // System.exit(0); + } + } + + public void processLogs() throws Exception { + try { + ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); + this.robotsList = counterRobots.getRobotsPatterns(); + + logger.info("Processing repository logs"); + processRepositoryLog(); + logger.info("Repository logs process done"); + + logger.info("Removing double clicks"); + removeDoubleClicks(); + logger.info("Removing double clicks done"); + + logger.info("Cleaning oai"); + cleanOAI(); + logger.info("Cleaning oai done"); + + logger.info("Processing portal logs"); + processPortalLog(); + logger.info("Portal logs process done"); + + logger.info("Processing portal usagestats"); + portalStats(); + logger.info("Portal usagestats process done"); + + logger.info("ViewsStats processing starts"); + viewsStats(); + logger.info("ViewsStats processing ends"); + + logger.info("DownloadsStats processing starts"); + downloadsStats(); + logger.info("DownloadsStats processing starts"); + + logger.info("Updating Production Tables"); + updateProdTables(); + logger.info("Updated Production Tables"); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + public void processRepositoryLog() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping piwiklogtmp_json table"); + String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json"; + stmt.executeUpdate(drop_piwiklogtmp_json); + logger.info("Dropped piwiklogtmp_json table"); + + logger.info("Creating piwiklogtmp_json"); + String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " `customVariables`: struct<\n" + + " `1`: struct<\n" + + " `customVariablePageValue1`: STRING\n" + + " >\n" + + " >,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_piwiklogtmp_json); + logger.info("Created piwiklogtmp_json"); + + logger.info("Dropping piwiklogtmp table"); + String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp"; + stmt.executeUpdate(drop_piwiklogtmp); + logger.info("Dropped piwiklogtmp"); + + logger.info("Creating piwiklogtmp"); + String create_piwiklogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_piwiklogtmp); + logger.info("Created piwiklogtmp"); + + logger.info("Inserting into piwiklogtmp"); + String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + + "actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent\n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_piwiklogtmp); + logger.info("Inserted into piwiklogtmp"); + + stmt.close(); + } + + public void removeDoubleClicks() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning download double clicks"); + // clean download double clicks + String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "WHERE EXISTS (\n" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" + + + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + + "AND p1.timestamp\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_process_portal_log_tmp_json); + logger.info("Created process_portal_log_tmp_json"); + + logger.info("Droping process_portal_log_tmp table"); + String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp"; + stmt.executeUpdate(drop_process_portal_log_tmp); + logger.info("Dropped process_portal_log_tmp"); + + logger.info("Creating process_portal_log_tmp"); + String create_process_portal_log_tmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_process_portal_log_tmp); + logger.info("Created process_portal_log_tmp"); + + logger.info("Inserting into process_portal_log_tmp"); + String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " + + + "actiondetail.url as url, " + + "CASE\n" + + " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + + " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " + + + " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + + " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + + " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + + " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + + " ELSE '' " + + "END AS entity_id, " + + "CASE " + + " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + + " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + + " ELSE '' " + + "END AS source_item_type, " + + "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + + "browser as agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_process_portal_log_tmp); + logger.info("Inserted into process_portal_log_tmp"); + + stmt.close(); + } + + public void portalStats() throws SQLException { + Connection con = ConnectDB.getHiveConnection(); + Statement stmt = con.createStatement(); + con.setAutoCommit(false); + +// Original queries where of the style +// +// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent +// FROM usagestats_20200907.process_portal_log_tmp2, +// openaire_prod_stats_20200821.result_oids roid +// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null +// +// The following query is an example of how queries should be +// +// +// INSERT INTO usagestats_20200907.piwiklogtmp +// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent +// FROM usagestats_20200907.process_portal_log_tmp +// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id +// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL); +// +// We should consider if we would like the queries to be as the following +// +// INSERT INTO usagestats_20200907.piwiklogtmp +// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent +// FROM usagestats_20200907.process_portal_log_tmp +// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id +// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND +// roid.oid != ''); + + logger.info("PortalStats - Step 1"); + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " + + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".result_oids roid WHERE roid.id IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("PortalStats - Step 2"); + stmt = con.createStatement(); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " + + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".datasource_oids roid WHERE roid.id IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + + /* + * logger.info("PortalStats - Step 3"); stmt = con.createStatement(); sql = "INSERT INTO " + + * ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + * "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'organization', `timestamp`, referrer_name, agent " + * + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + * "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + * "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + * ".organization_oids roid WHERE roid.id IS NOT NULL)"; // stmt.executeUpdate(sql); stmt.close(); + */ + logger.info("PortalStats - Step 3"); + stmt = con.createStatement(); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " + + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".project_oids roid WHERE roid.id IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + + con.close(); + } + + private void cleanOAI() throws Exception { + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning oai - Step 1"); + stmt = ConnectDB.getHiveConnection().createStatement(); + String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + + "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 2"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + + "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 3"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + + "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 4"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + + "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 5"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 6"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 7"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 8"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 9"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 10"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 11"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 12"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 13"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 14"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 15"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 16"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 17"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 18"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 19"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 20"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 21"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 22"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 23"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 24"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 25"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 26"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 27"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 28"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + + "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 29"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Done, closing connection"); + ConnectDB.getHiveConnection().close(); + } + + private String processPortalURL(String url) { + + if (url.indexOf("explore.openaire.eu") > 0) { + try { + url = URLDecoder.decode(url, "UTF-8"); + } catch (Exception e) { + logger.info("Error when decoding the following URL: " + url); + } + if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) { + url = "datasource|" + + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59); + } else if (url.indexOf("datasource=") > 0 + && url.substring(url.indexOf("datasource=") + 11).length() >= 46) { + url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57); + } else if (url.indexOf("datasourceFilter=") > 0 + && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) { + url = "datasource|" + + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63); + } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) { + url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56); + } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) { + url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56); + } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 + && !url.contains("oai:dnet:corda")) { + url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56); + } else if (url.indexOf("organizationId=") > 0 + && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) { + url = "organization|" + + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61); + } else { + url = ""; + } + } else { + url = ""; + } + + return url; + } + + private void updateProdTables() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Inserting data to piwiklog"); + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + stmt.executeUpdate(sql); + + logger.info("Inserting data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Inserting data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Inserting data to pageviews_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Creating usage_stats table"); + String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + + "AS SELECT coalesce(ds.source, vs.source) as source, " + + "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + + "coalesce(ds.openaire, 0) as openaire_downloads, " + + "coalesce(vs.openaire, 0) as openaire_views " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + + ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; + stmt.executeUpdate(createUsageStats); + logger.info("Created usage_stats table"); + + + /* + * logger.info("Dropping table views_stats_tmp"); sql = "DROP TABLE IF EXISTS " + + * ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(sql); + * logger.info("Dropping table downloads_stats_tmp"); sql = "DROP TABLE IF EXISTS " + + * ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; stmt.executeUpdate(sql); + * logger.info("Dropping table pageviews_stats_tmp"); sql = "DROP TABLE IF EXISTS " + + * ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(sql); + * logger.info("Dropping table process_portal_log_tmp"); sql = "DROP TABLE IF EXISTS " + + * ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; stmt.executeUpdate(sql); + */ + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } + + private ArrayList listHdfsDir(String dir) throws Exception { + + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + fileNames.add(fileName); + } + + hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath)); + throw new Exception("HDFS file path with exported data does not exist : " + logPath, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + if (!line.equals("[]")) { + sb.append(line); + } + // sb.append(line); + line = br.readLine(); + } + result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + if (result.equals("")) { + result = "[]"; + } + + // fs.close(); + } catch (Exception e) { + logger.error(e.getMessage()); + throw new Exception(e); + } + + return result; + } + + private Connection getConnection() throws SQLException { + return ConnectDB.getHiveConnection(); + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java new file mode 100644 index 000000000..6f020daa0 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java @@ -0,0 +1,54 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +/** + * @author D. Pierrakos, S. Zoupanos + */ +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.ArrayList; + +import org.json.JSONException; +import org.json.simple.JSONArray; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class ReadCounterRobotsList { + + private ArrayList robotsPatterns = new ArrayList(); + private String COUNTER_ROBOTS_URL; + + public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException { + COUNTER_ROBOTS_URL = url; + robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL); + } + + private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException { + InputStream is = new URL(url).openStream(); + JSONParser parser = new JSONParser(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1"))); + JSONArray jsonArray = (JSONArray) parser.parse(reader); + for (Object aJsonArray : jsonArray) { + org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray; + robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\")); + } + return robotsPatterns; + } + + public ArrayList getRobotsPatterns() { + return robotsPatterns; + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java new file mode 100644 index 000000000..54ed286cb --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java @@ -0,0 +1,575 @@ +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +// import java.io.BufferedReader; +// import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class SarcStats { + + private Statement stmtHive = null; + private Statement stmtImpala = null; + + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + + public SarcStats() throws Exception { +// createTables(); + } + + private void createTables() throws Exception { + try { + + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); + + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); + + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void reCreateLogDirs() throws IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); + + logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); + + logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); + + logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); + } + + public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping sarc_sushilogtmp_json_array table"); + String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); + logger.info("Dropped sarc_sushilogtmp_json_array table"); + + logger.info("Creating sarc_sushilogtmp_json_array table"); + String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_array); + logger.info("Created sarc_sushilogtmp_json_array table"); + + logger.info("Dropping sarc_sushilogtmp_json_non_array table"); + String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); + logger.info("Dropped sarc_sushilogtmp_json_non_array table"); + + logger.info("Creating sarc_sushilogtmp_json_non_array table"); + String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" + + " `ItemIdentifier` struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathNonArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); + logger.info("Created sarc_sushilogtmp_json_non_array table"); + + logger.info("Creating sarc_sushilogtmp table"); + String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(create_sarc_sushilogtmp); + logger.info("Created sarc_sushilogtmp table"); + + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "WHERE `ItemIdent`.`Type`='DOI'"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + + ConnectDB.getHiveConnection().close(); + } + + public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); + + logger.info("Dropping sarc_sushilogtmp table"); + String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp"; + stmt.executeUpdate(drop_sarc_sushilogtmp); + logger.info("Dropped sarc_sushilogtmp table"); + ConnectDB.getHiveConnection().close(); + + List issnAndUrls = new ArrayList(); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" + }); + issnAndUrls.add(new String[]{ + "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" + }); + issnAndUrls.add(new String[]{ + "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" + }); + + if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 + && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); + issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); + } + + logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); + + for (String[] issnAndUrl : issnAndUrls) { + logger.info("Now working on ISSN: " + issnAndUrl[1]); + getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); + } + + } + + public void finalizeSarcStats() throws Exception { + stmtHive = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + stmtImpala = ConnectDB.getImpalaConnection().createStatement(); + + logger.info("Creating downloads_stats table_tmp"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_tmp " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmtHive.executeUpdate(createDownloadsStats); + logger.info("Created downloads_stats_tmp table"); + + logger.info("Dropping sarc_sushilogtmp_impala table"); + String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala"; + stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); + logger.info("Dropped sarc_sushilogtmp_impala table"); + + logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); + String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala " + + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(createSarcSushilogtmpImpala); + logger.info("Created sarc_sushilogtmp_impala"); + + logger.info("Making sarc_sushilogtmp visible to impala"); + String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_impala;"; + stmtImpala.executeUpdate(invalidateMetadata); + + logger.info("Dropping downloads_stats_impala table"); + String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala"; + stmtHive.executeUpdate(drop_downloads_stats_impala); + logger.info("Dropped downloads_stats_impala table"); + + logger.info("Making downloads_stats_impala deletion visible to impala"); + try { + String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala;"; + stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); + } catch (SQLException sqle) { + } + + // We run the following query in Impala because it is faster + logger.info("Creating downloads_stats_impala"); + String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_impala AS " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " + + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_pids ro " + + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " + + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; + stmtImpala.executeUpdate(createDownloadsStatsImpala); + logger.info("Creating downloads_stats_impala"); + + // Insert into downloads_stats + logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats_tmp SELECT * " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; + stmtHive.executeUpdate(insertDStats); + logger.info("Inserted into downloads_stats_tmp"); + + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository_id` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmtHive.executeUpdate(createSushilog); + logger.info("Created sushilog table"); + + // Insert into sushilog + logger.info("Inserting into sushilog"); + String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(insertSushiLog); + logger.info("Inserted into sushilog"); + + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + } + + public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, + String url, String issn) throws Exception { + logger.info("Processing SARC! issn: " + issn + " with url: " + url); + ConnectDB.getHiveConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, issn); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + + // Creating the needed configuration for the correct storing of data + Configuration config = new Configuration(); + config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); + config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); + config + .set( + "fs.hdfs.impl", + org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + config + .set( + "fs.file.impl", + org.apache.hadoop.fs.LocalFileSystem.class.getName()); + FileSystem dfs = FileSystem.get(config); + + if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn); + } else { + + while (start.before(end)) { + String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); + start.add(Calendar.MONTH, 1); + + logger.info("(getARReport) Getting report: " + reportUrl); + String text = getJson(reportUrl); + if (text == null) { + continue; + } + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = null; + try { + jsonObject = (JSONObject) parser.parse(text); + } // if there is a parsing error continue with the next url + catch (ParseException pe) { + continue; + } + + jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("sc:Report"); + if (jsonObject == null) { + continue; + } + jsonObject = (JSONObject) jsonObject.get("c:Report"); + jsonObject = (JSONObject) jsonObject.get("c:Customer"); + Object obj = jsonObject.get("c:ReportItems"); + JSONArray jsonArray = new JSONArray(); + if (obj instanceof JSONObject) { + jsonArray.add(obj); + } else { + jsonArray = (JSONArray) obj; + // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); + } + if (jsonArray == null) { + continue; + } + + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathArray); + FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); + + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathNonArray); + FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); + + for (Object aJsonArray : jsonArray) { + + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + renameKeysRecursively(":", jsonObjectRow); + + if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { + finNonArray.write(jsonObjectRow.toJSONString().getBytes()); + finNonArray.writeChar('\n'); + } else { + finArray.write(jsonObjectRow.toJSONString().getBytes()); + finArray.writeChar('\n'); + } + } + + finArray.close(); + finNonArray.close(); + + // Check the file size and if it is too big, delete it + File fileArray = new File(filePathArray); + if (fileArray.length() == 0) + fileArray.delete(); + File fileNonArray = new File(filePathNonArray); + if (fileNonArray.length() == 0) + fileNonArray.delete(); + + } + + dfs.close(); + } + //ConnectDB.getHiveConnection().close(); + } + + private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { + for (Object jjval : givenJsonObj) { + if (jjval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jjval); + } else if (jjval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jjval); + } // All other types of vals + else + ; + } + } + + private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { + Set jkeys = new HashSet(givenJsonObj.keySet()); + for (String jkey : jkeys) { + + String[] splitArray = jkey.split(delimiter); + String newJkey = splitArray[splitArray.length - 1]; + + Object jval = givenJsonObj.get(jkey); + givenJsonObj.remove(jkey); + givenJsonObj.put(newJkey, jval); + + if (jval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jval); + } + + if (jval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jval); + } + } + } + + private String getJson(String url) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + + // Logging error and silently continuing + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); +// return null; +// throw new Exception("Failed to get URL: " + e.toString(), e); + } + return ""; + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java new file mode 100644 index 000000000..c4ee7d63c --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java @@ -0,0 +1,186 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.IOException; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Main class for downloading and processing Usage statistics + * + * @author D. Pierrakos, S. Zoupanos + */ +public class UsageStatsExporter { + + public UsageStatsExporter() { + + } + + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + private void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); + + logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); + + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + + logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); + + logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); + + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } + + public void export() throws Exception { + + logger.info("Initialising DB properties"); + ConnectDB.init(); + +// runImpalaQuery(); + + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + + logger.info("Re-creating database and tables"); + if (ExecuteWorkflow.recreateDbAndTables) + piwikstatsdb.recreateDBAndTables(); + ; + + logger.info("Initializing the download logs module"); + PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); + + if (ExecuteWorkflow.piwikEmptyDirs) { + logger.info("Recreating Piwik log directories"); + piwikstatsdb.reCreateLogDirs(); + } + + // Downloading piwik logs (also managing directory creation) + if (ExecuteWorkflow.downloadPiwikLogs) { + logger.info("Downloading piwik logs"); + piwd + .GetOpenAIRELogs( + ExecuteWorkflow.repoLogPath, + ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); + } + logger.info("Downloaded piwik logs"); +/* + // Create DB tables, insert/update statistics + String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; + piwikstatsdb.setCounterRobotsURL(cRobotsUrl); + + if (ExecuteWorkflow.processPiwikLogs) { + logger.info("Processing logs"); + piwikstatsdb.processLogs(); + } +*/ + logger.info("Creating LaReferencia tables"); + LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, + ExecuteWorkflow.lareferenciaAuthToken); + + if (ExecuteWorkflow.laReferenciaEmptyDirs) { + logger.info("Recreating LaReferencia log directories"); + lrf.reCreateLogDirs(); + } + + if (ExecuteWorkflow.downloadLaReferenciaLogs) { + logger.info("Downloading LaReferencia logs"); + lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); + logger.info("Downloaded LaReferencia logs"); + } +/* + LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); + + if (ExecuteWorkflow.processLaReferenciaLogs) { + logger.info("Processing LaReferencia logs"); + lastats.processLogs(); + logger.info("LaReferencia logs done"); + } +*/ + IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); + if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { + logger.info("Creating Irus Stats tables"); + irusstats.createTables(); + logger.info("Created Irus Stats tables"); + + logger.info("Re-create log dirs"); + irusstats.reCreateLogDirs(); + logger.info("Re-created log dirs"); + } + + if (ExecuteWorkflow.irusDownloadReports) { + irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); + } +/* + if (ExecuteWorkflow.irusProcessStats) { + irusstats.processIrusStats(); + logger.info("Irus done"); + } +*/ + SarcStats sarcStats = new SarcStats(); + if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { + sarcStats.reCreateLogDirs(); + } + if (ExecuteWorkflow.sarcDownloadReports) { + sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + } +/* + if (ExecuteWorkflow.sarcProcessStats) { + sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + sarcStats.finalizeSarcStats(); + } + logger.info("Sarc done"); +*/ + +/* + // finalize usagestats + if (ExecuteWorkflow.finalizeStats) { + piwikstatsdb.finalizeStats(); + logger.info("Finalized stats"); + } +*/ + +/* + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } +*/ + logger.info("End"); + } + + private void invalidateMetadata() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getImpalaConnection().createStatement(); + + String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json new file mode 100644 index 000000000..988c23b48 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json @@ -0,0 +1,231 @@ +[ + { + "paramName": "mat", + "paramLongName": "matomoAuthToken", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "mbu", + "paramLongName": "matomoBaseURL", + "paramDescription": "URL of the isLookUp Service", + "paramRequired": true + }, + { + "paramName": "rlp", + "paramLongName": "repoLogPath", + "paramDescription": "nameNode of the source cluster", + "paramRequired": true + }, + { + "paramName": "plp", + "paramLongName": "portalLogPath", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "pmi", + "paramLongName": "portalMatomoID", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "iukbuw", + "paramLongName": "irusUKBaseURL", + "paramDescription": "working directory", + "paramRequired": true + }, + { + "paramName": "iukrp", + "paramLongName": "irusUKReportPath", + "paramDescription": "maximum number of map tasks used in the distcp process", + "paramRequired": true + }, + { + "paramName": "srpa", + "paramLongName": "sarcsReportPathArray", + "paramDescription": "memory for distcp action copying actionsets from remote cluster", + "paramRequired": true + }, + { + "paramName": "srpna", + "paramLongName": "sarcsReportPathNonArray", + "paramDescription": "timeout for distcp copying actions from remote cluster", + "paramRequired": true + }, + { + "paramName": "llp", + "paramLongName": "lareferenciaLogPath", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "lbu", + "paramLongName": "lareferenciaBaseURL", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "lat", + "paramLongName": "lareferenciaAuthToken", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "usdbs", + "paramLongName": "usageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "piwikEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ppwl", + "paramLongName": "processPiwikLogs", + "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "dpwl", + "paramLongName": "downloadPiwikLogs", + "paramDescription": "download piwik logs?", + "paramRequired": true + }, + { + "paramName": "slp", + "paramLongName": "startingLogPeriod", + "paramDescription": "Starting log period", + "paramRequired": true + }, + { + "paramName": "elp", + "paramLongName": "endingLogPeriod", + "paramDescription": "Ending log period", + "paramRequired": true + }, + { + "paramName": "npidd", + "paramLongName": "numberOfPiwikIdsToDownload", + "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload", + "paramRequired": true + }, + { + "paramName": "nsidd", + "paramLongName": "numberOfSiteIdsToDownload", + "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload", + "paramRequired": true + }, + { + "paramName": "lerd", + "paramLongName": "laReferenciaEmptyDirs", + "paramDescription": "Empty LaReferencia directories?", + "paramRequired": true + }, + { + "paramName": "plrl", + "paramLongName": "processLaReferenciaLogs", + "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "dlrl", + "paramLongName": "downloadLaReferenciaLogs", + "paramDescription": "download La Referencia logs?", + "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "irusCreateTablesEmptyDirs", + "paramDescription": "Irus section: Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "irusDownloadReports", + "paramDescription": "Irus section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "irusProcessStats", + "paramDescription": "Irus section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "irusNumberOfOpendoarsToDownload", + "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", + "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "sarcCreateTablesEmptyDirs", + "paramDescription": "Sarc section: Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "sarcDownloadReports", + "paramDescription": "Sarc section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "sarcProcessStats", + "paramDescription": "Sarc section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "sarcNumberOfIssnToDownload", + "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", + "paramRequired": true + }, + + { + "paramName": "fs", + "paramLongName": "finalizeStats", + "paramDescription": "Create the usage_stats table?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", + "paramRequired": true + }, + { + "paramName": "nodt", + "paramLongName": "numberOfDownloadThreads", + "paramDescription": "Number of download threads", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml new file mode 100644 index 000000000..b5c807378 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml new file mode 100644 index 000000000..a6600516d --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml @@ -0,0 +1,90 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + eu.dnetlib.oa.graph.usagerawdata.export.ExecuteWorkflow + --matomoAuthToken${matomoAuthToken} + --matomoBaseURL${matomoBaseURL} + --repoLogPath${repoLogPath} + --portalLogPath${portalLogPath} + --portalMatomoID${portalMatomoID} + --irusUKBaseURL${irusUKBaseURL} + --irusUKReportPath${irusUKReportPath} + --sarcsReportPathArray${sarcsReportPathArray} + --sarcsReportPathNonArray${sarcsReportPathNonArray} + --lareferenciaLogPath${lareferenciaLogPath} + --lareferenciaBaseURL${lareferenciaBaseURL} + --lareferenciaAuthToken${lareferenciaAuthToken} + --dbHiveUrl${hiveJdbcUrl} + --dbImpalaUrl${impalaJdbcUrl} + --usageStatsDBSchema${usageStatsDBSchema} + --statsDBSchema${statsDBSchema} + --recreateDbAndTables${recreateDbAndTables} + --piwikEmptyDirs${piwikEmptyDirs} + --downloadPiwikLogs${downloadPiwikLogs} + --processPiwikLogs${processPiwikLogs} + --startingLogPeriod${startingLogPeriod} + --endingLogPeriod${endingLogPeriod} + --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} + --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} + --laReferenciaEmptyDirs${laReferenciaEmptyDirs} + --downloadLaReferenciaLogs${downloadLaReferenciaLogs} + --processLaReferenciaLogs${processLaReferenciaLogs} + --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} + --irusDownloadReports${irusDownloadReports} + --irusProcessStats${irusProcessStats} + --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload} + --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs} + --sarcDownloadReports${sarcDownloadReports} + --sarcProcessStats${sarcProcessStats} + --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} + --finalizeStats${finalizeStats} + --finalTablesVisibleToImpala${finalTablesVisibleToImpala} + --numberOfDownloadThreads${numberOfDownloadThreads} + + + + + + + + diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java index 6947381c9..011c90532 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java @@ -174,7 +174,7 @@ public class IrusStats { + "WHERE `ItemIdent`.`Type`= 'OAI'"; stmt.executeUpdate(insertSushilogtmp); logger.info("Inserted to irus_sushilogtmp table"); - + logger.info("Creating downloads_stats table"); String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java index 347d3de21..747b5ce0e 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java @@ -357,6 +357,19 @@ public class LaReferenciaStats { "select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; stmt.executeUpdate(sql); + logger.info("Inserting data to usage_stats from lareferencia"); + sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + + "SELECT coalesce(ds.source, vs.source) as source, " + + "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + + "coalesce(ds.openaire, 0) as openaire_downloads, " + + "coalesce(vs.openaire, 0) as openaire_views " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp AS ds FULL OUTER JOIN " + + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp AS vs ON ds.source=vs.source " + + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; + stmt.executeUpdate(sql); + logger.info("Inserted data to usage_stats from lareferencia"); // sql = "insert into public.downloads_stats select * from la_downloads_stats_tmp;"; // stmt.executeUpdate(sql); diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java index 7a64e48d2..8f7fffa9f 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java @@ -1,4 +1,3 @@ - package eu.dnetlib.oa.graph.usagestats.export; import java.io.*; @@ -31,293 +30,296 @@ import org.slf4j.LoggerFactory; */ public class PiwikDownloadLogs { - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; - /* + /* * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; - private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); + private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); - public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; + public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; - } + } - private String getPiwikLogUrl() { - return "https://" + piwikUrl + "/"; - } + private String getPiwikLogUrl() { + return "https://" + piwikUrl + "/"; + } - private String getJson(String url) throws Exception { - try { - logger.debug("Connecting to download the JSON: " + url); - URL website = new URL(url); - URLConnection connection = website.openConnection(); + private String getJson(String url) throws Exception { + try { + logger.debug("Connecting to download the JSON: " + url); + URL website = new URL(url); + URLConnection connection = website.openConnection(); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + url + " Exception: " + e); - throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); - } - } + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + url + " Exception: " + e); + throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); + } + } - class WorkerThread implements Runnable { - private Calendar currDay; - private int siteId; - private String repoLogsPath; - private String portalLogPath; - private String portalMatomoID; + class WorkerThread implements Runnable { - public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws IOException { - this.currDay = (Calendar) currDay.clone(); - this.siteId = new Integer(siteId); - this.repoLogsPath = new String(repoLogsPath); - this.portalLogPath = new String(portalLogPath); - this.portalMatomoID = new String(portalMatomoID); - } + private Calendar currDay; + private int siteId; + private String repoLogsPath; + private String portalLogPath; + private String portalMatomoID; - public void run() { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - System.out - .println( - Thread.currentThread().getName() + " (Start) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + - ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + - ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - try { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws IOException { + this.currDay = (Calendar) currDay.clone(); + this.siteId = new Integer(siteId); + this.repoLogsPath = new String(repoLogsPath); + this.portalLogPath = new String(portalLogPath); + this.portalMatomoID = new String(portalMatomoID); + } - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - System.out - .println( - Thread.currentThread().getName() + " (End) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + - ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + - ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - } + public void run() { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + System.out + .println( + Thread.currentThread().getName() + " (Start) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + try { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + System.out + .println( + Thread.currentThread().getName() + " (End) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + } - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } - int i = 0; + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); + int i = 0; - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) - break; + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - writtenBytes += jsonObjectRawBytes.length + 1; - } + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); + writtenBytes += jsonObjectRawBytes.length + 1; + } - i++; - } while (true); + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); - fs.close(); - } - } + i++; + } while (true); - public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { + fs.close(); + } + } - Statement statement = ConnectDB.getHiveConnection().createStatement(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { - ResultSet rs = statement - .executeQuery( - "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() - + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + Statement statement = ConnectDB.getHiveConnection().createStatement(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - // Getting all the piwikids in a list for logging reasons & limitting the list - // to the max number of piwikids - List piwikIdToVisit = new ArrayList(); + ResultSet rs = statement + .executeQuery( + "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() + + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + + // Getting all the piwikids in a list for logging reasons & limitting the list + // to the max number of piwikids + List piwikIdToVisit = new ArrayList(); while (rs.next()) piwikIdToVisit.add(rs.getInt(1)); - logger.info("Found the following piwikIds for download: " + piwikIdToVisit); + logger.info("Found the following piwikIds for download: " + piwikIdToVisit); - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 && - ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { - logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { + logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } - logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); + logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); + // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); + for (int siteId : piwikIdToVisit) { + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); - // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); - for (int siteId : piwikIdToVisit) { - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); + logger.info("Now working on piwikId: " + siteId); - logger.info("Now working on piwikId: " + siteId); + PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog WHERE source=?"); + st.setInt(1, siteId); + Date dateMax = null; + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); - PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".piwiklog WHERE source=?"); - st.setInt(1, siteId); - Date dateMax=null; - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + // logger.info("Date used " + currDay.toString()); + // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + // executor.execute(worker);// calling execute method of ExecutorService + logger.info("Date used " + currDay.getTime().toString()); - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - // logger.info("Date used " + currDay.toString()); - // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - // executor.execute(worker);// calling execute method of ExecutorService - logger.info("Date used " + currDay.getTime().toString()); + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); + } else { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + } - if(dateMax!=null && currDay.getTime().compareTo(dateMax)<=0) - logger.info("Date found in logs "+dateMax+ " and not downloanding Matomo logs for "+siteId); - else - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + } + } + // executor.shutdown(); + // while (!executor.isTerminated()) { + // } + // System.out.println("Finished all threads"); + } - } - } - // executor.shutdown(); - // while (!executor.isTerminated()) { - // } - // System.out.println("Finished all threads"); - } + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; + int i = 0; - int i = 0; + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) - break; + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); + writtenBytes += jsonObjectRawBytes.length + 1; + } - writtenBytes += jsonObjectRawBytes.length + 1; - } + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); + i++; + } while (true); - i++; - } while (true); - - fs.close(); - } + fs.close(); + } } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java index 6625c381b..6d5bdfac0 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java @@ -551,17 +551,20 @@ public class PiwikStatsDB { stmt.executeUpdate(createViewsStats); logger.info("Created views_stats table"); - String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + - "AS SELECT coalesce(ds.source, vs.source) as source, " + + logger.info("Inserting data to usage_stats"); + sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + + "SELECT coalesce(ds.source, vs.source) as source, " + "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + "coalesce(ds.openaire, 0) as openaire_downloads, " + "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp AS ds FULL OUTER JOIN " + + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp AS vs ON ds.source=vs.source " + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(createUsageStats); + stmt.executeUpdate(sql); + logger.info("Inserted data to usage_stats"); + stmt.close(); ConnectDB.getHiveConnection().close(); @@ -1167,7 +1170,22 @@ public class PiwikStatsDB { "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(sql); - /* + logger.info("Creating usage_stats table"); + String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + + "AS SELECT coalesce(ds.source, vs.source) as source, " + + "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + + "coalesce(ds.openaire, 0) as openaire_downloads, " + + "coalesce(vs.openaire, 0) as openaire_views " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + + ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; + stmt.executeUpdate(createUsageStats); + logger.info("Created usage_stats table"); + + + /* * logger.info("Dropping table views_stats_tmp"); sql = "DROP TABLE IF EXISTS " + * ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(sql); * logger.info("Dropping table downloads_stats_tmp"); sql = "DROP TABLE IF EXISTS " + diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java index 71dfc6f61..06e350c9e 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java @@ -286,9 +286,9 @@ public class SarcStats { ConnectDB.getHiveConnection().setAutoCommit(false); stmtImpala = ConnectDB.getImpalaConnection().createStatement(); - logger.info("Creating downloads_stats table"); + logger.info("Creating downloads_stats table_tmp"); String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats " + + ".downloads_stats_tmp " + "(`source` string, " + "`repository_id` string, " + "`result_id` string, " @@ -296,7 +296,7 @@ public class SarcStats { + "`count` bigint, " + "`openaire` bigint)"; stmtHive.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats table"); + logger.info("Created downloads_stats_tmp table"); logger.info("Dropping sarc_sushilogtmp_impala table"); String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " @@ -341,20 +341,19 @@ public class SarcStats { + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".datasource_results dr, " + ConnectDB.getStatsDBSchema() + ".result_pids ro " - + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND dr.id=d.id AND dr.result=ro.id AND " - + "s.rid=ro.pid AND ro.type='Digital Object Identifier' AND metric_type='ft_total' AND s.source='SARC-OJS'"; + + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " + + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; stmtImpala.executeUpdate(createDownloadsStatsImpala); logger.info("Creating downloads_stats_impala"); // Insert into downloads_stats - logger.info("Inserting data from downloads_stats_impala into downloads_stats"); + logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp"); String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats SELECT * " + + ".downloads_stats_tmp SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; stmtHive.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats"); + logger.info("Inserted into downloads_stats_tmp"); logger.info("Creating sushilog table"); String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java index ae901dfa5..405b58bd5 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java @@ -173,7 +173,7 @@ public class UsageStatsExporter { sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); - stmt.close(); + stmt.close(); ConnectDB.getHiveConnection().close(); } } From bbcf6b7c8b0e29ae52371f4c5694dc10f5d53c7a Mon Sep 17 00:00:00 2001 From: Dimitris Date: Tue, 17 Nov 2020 08:36:51 +0200 Subject: [PATCH 04/16] Commit 17112020 --- .../usagerawdata/export/ExecuteWorkflow.java | 22 +- .../graph/usagerawdata/export/IrusStats.java | 4 +- .../export/LaReferenciaStats.java | 12 +- .../export/PiwikDownloadLogs.java | 1 + .../usagerawdata/export/PiwikStatsDB.java | 425 +----------------- .../graph/usagerawdata/export/SarcStats.java | 4 +- .../export/UsageStatsExporter.java | 32 +- dhp-workflows/dhp-usage-stats-build/pom.xml | 79 ++++ .../usagestatsbuild/export/ConnectDB.java | 130 ++++++ .../export/ExecuteWorkflow.java | 172 +++++++ .../usagestatsbuild/export/IrusStats.java | 71 +++ .../export/LaReferenciaStats.java | 149 ++++++ .../usagestatsbuild/export/PiwikStatsDB.java | 345 ++++++++++++++ .../usagestatsbuild/export/SarcStats.java | 106 +++++ .../export/UsageStatsExporter.java | 106 +++++ .../export/usagestatsbuild_parameters.json | 237 ++++++++++ .../oozie_app/config-default.xml | 38 ++ .../usagestatsbuild/oozie_app/workflow.xml | 91 ++++ 18 files changed, 1589 insertions(+), 435 deletions(-) create mode 100644 dhp-workflows/dhp-usage-stats-build/pom.xml create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java index 774dcf0b7..81e34b3e7 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java @@ -112,13 +112,13 @@ public class ExecuteWorkflow { downloadPiwikLogs = true; else downloadPiwikLogs = false; -/* + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) processPiwikLogs = true; else processPiwikLogs = false; -*/ - String startingLogPeriodStr = parser.get("startingLogPeriod"); + + String startingLogPeriodStr = parser.get("startingLogPeriod"); Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); @@ -138,12 +138,12 @@ public class ExecuteWorkflow { downloadLaReferenciaLogs = true; else downloadLaReferenciaLogs = false; -/* + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) processLaReferenciaLogs = true; else processLaReferenciaLogs = false; -*/ + if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) irusCreateTablesEmptyDirs = true; else @@ -153,13 +153,13 @@ public class ExecuteWorkflow { irusDownloadReports = true; else irusDownloadReports = false; -/* + if (parser.get("irusProcessStats").toLowerCase().equals("true")) irusProcessStats = true; else irusProcessStats = false; irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); -*/ + if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) sarcCreateTablesEmptyDirs = true; else @@ -169,13 +169,13 @@ public class ExecuteWorkflow { sarcDownloadReports = true; else sarcDownloadReports = false; -/* + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) sarcProcessStats = true; else sarcProcessStats = false; sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); -*/ + /* if (parser.get("finalizeStats").toLowerCase().equals("true")) finalizeStats = true; @@ -184,8 +184,8 @@ public class ExecuteWorkflow { if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) finalTablesVisibleToImpala = true; else -*/ finalTablesVisibleToImpala = false; - + finalTablesVisibleToImpala = false; +*/ numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); UsageStatsExporter usagestatsExport = new UsageStatsExporter(); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java index 090f76ff5..bb8d8565e 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java @@ -174,7 +174,7 @@ public class IrusStats { + "WHERE `ItemIdent`.`Type`= 'OAI'"; stmt.executeUpdate(insertSushilogtmp); logger.info("Inserted to irus_sushilogtmp table"); - +/* logger.info("Creating downloads_stats table"); String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " @@ -209,7 +209,7 @@ public class IrusStats { + "`count` int)"; stmt.executeUpdate(createSushilog); logger.info("Created sushilog table"); - +*/ logger.info("Inserting to sushilog table"); String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java index ef7636099..c20781767 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java @@ -116,13 +116,16 @@ public class LaReferenciaStats { removeDoubleClicks(); logger.info("LaReferencia removed double clicks"); +/******** logger.info("LaReferencia creating viewsStats"); viewsStats(); logger.info("LaReferencia created viewsStats"); logger.info("LaReferencia creating downloadsStats"); downloadsStats(); logger.info("LaReferencia created downloadsStats"); - logger.info("LaReferencia updating Production Tables"); + +************/ + logger.info("LaReferencia updating Production Tables"); updateProdTables(); logger.info("LaReferencia updated Production Tables"); @@ -343,7 +346,7 @@ public class LaReferenciaStats { String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog " + "select * from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp"; stmt.executeUpdate(sql); - +/***** logger.info("Updating views_stats"); sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + "select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; @@ -372,6 +375,11 @@ public class LaReferenciaStats { logger.info("Inserted data to usage_stats from lareferencia"); // sql = "insert into public.downloads_stats select * from la_downloads_stats_tmp;"; // stmt.executeUpdate(sql); +****/ + logger.info("Dropping lareferencialogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp"; + logger.info("Dropped lareferencialogtmp"); + stmt.executeUpdate(sql); stmt.close(); ConnectDB.getHiveConnection().close(); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java index 681105de4..5cc9ec563 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java @@ -196,6 +196,7 @@ public class PiwikDownloadLogs { //while (rs.next()) //piwikIdToVisit.add(rs.getInt(1)); piwikIdToVisit.add(13); + piwikIdToVisit.add(109); logger.info("Found the following piwikIds for download: " + piwikIdToVisit); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java index e0225d49a..4903d9599 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java @@ -60,7 +60,7 @@ public class PiwikStatsDB { this.createTables(); // The piwiklog table is not needed since it is built // on top of JSON files - this.createTmpTables(); + ////////////this.createTmpTables(); } public ArrayList getRobotsList() { @@ -141,7 +141,7 @@ public class PiwikStatsDB { } } - private void createTmpTables() throws Exception { +/***** public void createTmpTables() throws Exception { try { Statement stmt = ConnectDB.getHiveConnection().createStatement(); String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " @@ -181,7 +181,7 @@ public class PiwikStatsDB { // System.exit(0); } } - +******/ public void processLogs() throws Exception { try { ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); @@ -203,9 +203,10 @@ public class PiwikStatsDB { processPortalLog(); logger.info("Portal logs process done"); - logger.info("Processing portal usagestats"); + logger.info("Processing portal usagestats"); portalStats(); logger.info("Portal usagestats process done"); +/***** logger.info("ViewsStats processing starts"); viewsStats(); @@ -214,11 +215,12 @@ public class PiwikStatsDB { logger.info("DownloadsStats processing starts"); downloadsStats(); logger.info("DownloadsStats processing starts"); - +*****/ logger.info("Updating Production Tables"); updateProdTables(); logger.info("Updated Production Tables"); + } catch (Exception e) { logger.error("Failed to process logs: " + e); throw new Exception("Failed to process logs: " + e.toString(), e); @@ -338,369 +340,6 @@ public class PiwikStatsDB { stmt.close(); } - public void viewsStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Dropping result_views_monthly_tmp table"); - String drop_result_views_monthly_tmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".result_views_monthly_tmp"; - stmt.executeUpdate(drop_result_views_monthly_tmp); - logger.info("Dropped result_views_monthly_tmp table"); - - logger.info("Creating result_views_monthly_tmp table"); - String create_result_views_monthly_tmp = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() - + ".result_views_monthly_tmp " + - "AS SELECT entity_id AS id, " + - "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + - "AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageStatsDBSchema() - + ".piwiklogtmp where action='action' and (source_item_type='oaItem' or " + - "source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + - "source ORDER BY source, entity_id"; - stmt.executeUpdate(create_result_views_monthly_tmp); - logger.info("Created result_views_monthly_tmp table"); - - logger.info("Dropping views_stats_tmp table"); - String drop_views_stats_tmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".views_stats_tmp"; - stmt.executeUpdate(drop_views_stats_tmp); - logger.info("Dropped views_stats_tmp table"); - - logger.info("Creating views_stats_tmp table"); - String create_views_stats_tmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".views_stats_tmp " + - "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(views) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.piwik_id AND p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month"; - stmt.executeUpdate(create_views_stats_tmp); - logger.info("Created views_stats_tmp table"); -/* - logger.info("Dropping views_stats table"); - String drop_views_stats = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".views_stats"; - stmt.executeUpdate(drop_views_stats); - logger.info("Dropped views_stats table"); -*/ - logger.info("Creating views_stats table"); - String create_view_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp STORED AS PARQUET"; - stmt.executeUpdate(create_view_stats); - logger.info("Created views_stats table"); - - logger.info("Dropping pageviews_stats_tmp table"); - String drop_pageviews_stats_tmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".pageviews_stats_tmp"; - stmt.executeUpdate(drop_pageviews_stats_tmp); - logger.info("Dropped pageviews_stats_tmp table"); - - logger.info("Creating pageviews_stats_tmp table"); - String create_pageviews_stats_tmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".pageviews_stats_tmp AS SELECT " + - "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=" + ExecuteWorkflow.portalMatomoID + " AND p.source=d.piwik_id and p.id=ro.id \n" + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month"; - stmt.executeUpdate(create_pageviews_stats_tmp); - logger.info("Created pageviews_stats_tmp table"); - -/* logger.info("Droping pageviews_stats table"); - String drop_pageviews_stats = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".pageviews_stats"; - stmt.executeUpdate(drop_pageviews_stats); - logger.info("Dropped pageviews_stats table"); -*/ - logger.info("Creating pageviews_stats table"); - String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".pageviews_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp STORED AS PARQUET"; - stmt.executeUpdate(create_pageviews_stats); - logger.info("Created pageviews_stats table"); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } - - private void downloadsStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Dropping result_downloads_monthly_tmp view"); - String drop_result_downloads_monthly_tmp = "DROP VIEW IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".result_downloads_monthly_tmp"; - stmt.executeUpdate(drop_result_downloads_monthly_tmp); - logger.info("Dropped result_downloads_monthly_tmp view"); - - logger.info("Creating result_downloads_monthly_tmp view"); - String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".result_downloads_monthly_tmp " + - "AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " + - "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp where action='download' " + - "AND (source_item_type='oaItem' OR source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + - "ORDER BY source, entity_id, month"; - stmt.executeUpdate(sql); - logger.info("Created result_downloads_monthly_tmp view"); - - logger.info("Dropping downloads_stats_tmp table"); - String drop_views_stats = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".downloads_stats_tmp"; - stmt.executeUpdate(drop_views_stats); - logger.info("Dropped downloads_stats_tmp table"); - - logger.info("Creating downloads_stats_tmp table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp AS " + - "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(downloads) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".result_downloads_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.piwik_id and p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month"; - stmt.executeUpdate(sql); - logger.info("Created downloads_stats_tmp table"); -/* - logger.info("Dropping downloads_stats table"); - String drop_downloads_stats = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".downloads_stats"; - stmt.executeUpdate(drop_downloads_stats); - logger.info("Dropped downloads_stats table"); -*/ - logger.info("Creating downloads_stats table"); - String create_downloads_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp STORED AS PARQUET "; - stmt.executeUpdate(create_downloads_stats); - logger.info("Created downloads_stats table"); - - logger.info("Dropping result_downloads_monthly_tmp view"); - sql = "DROP VIEW IF EXISTS result_downloads_monthly_tmp"; - logger.info("Dropped result_downloads_monthly_tmp view"); - stmt.executeUpdate(sql); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } - - public void finalizeStats() throws Exception { - stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Dropping full_dates table"); - String dropFullDates = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".full_dates"; - stmt.executeUpdate(dropFullDates); - logger.info("Dropped full_dates table"); - - Calendar startCalendar = Calendar.getInstance(); - startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01")); - Calendar endCalendar = Calendar.getInstance(); - int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR); - int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH); - - logger.info("Creating full_dates table"); - String sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " + - "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " + - "FROM (SELECT DATE '2016-01-01' AS from_date) p " + - "LATERAL VIEW " + - "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x"; - stmt.executeUpdate(sql); - logger.info("Created full_dates table"); - - logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".downloads_stats " + - "(`source` string, " + - "`repository_id` string, " + - "`result_id` string, " + - "`date` string, " + - "`count` bigint, " + - "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats table"); - - logger.info("Creating views_stats table"); - String createViewsStats = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".views_stats " + - "(`source` string, " + - "`repository_id` string, " + - "`result_id` string, " + - "`date` string, " + - "`count` bigint, " + - "`openaire` bigint)"; - stmt.executeUpdate(createViewsStats); - logger.info("Created views_stats table"); - - logger.info("Inserting data to usage_stats"); - sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + - "SELECT coalesce(ds.source, vs.source) as source, " + - "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + - "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + - "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + - "coalesce(ds.openaire, 0) as openaire_downloads, " + - "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp AS vs ON ds.source=vs.source " + - "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(sql); - logger.info("Inserted data to usage_stats"); - - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } - - // Create repository Views statistics - private void repositoryViewsStats() throws Exception { - stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - -// String sql = "SELECT entity_id AS id , COUNT(entity_id) AS number_of_views, timestamp::date AS date, source INTO repo_view_stats FROM piwiklog WHERE source!='5' AND action=\'action\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;"; - String sql = "CREATE TABLE IF NOT EXISTS repo_view_stats AS SELECT entity_id AS id , COUNT(entity_id) AS number_of_views, timestamp::date AS date, source FROM piwiklog WHERE source!='5' AND action=\'action\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_view_stats_id ON repo_view_stats USING btree (id)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_view_stats_date ON repo_view_stats USING btree(date)"; - stmt.executeUpdate(sql); - -// sql = "SELECT roid.id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source INTO repo_view_stats_monthly_clean FROM repo_view_stats rvs, result_oids roid where rvs.id=roid.orid group by roid.id, month, source;"; - sql = "CREATE TABLE IF NOT EXISTS repo_view_stats_monthly_clean AS SELECT roid.id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source FROM repo_view_stats rvs, result_oids roid where rvs.id=roid.orid group by roid.id, month, source;"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_view_stats_monthly_clean_id ON repo_view_stats_monthly_clean USING btree (id)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_view_stats_monthly_clean_month ON repo_view_stats_monthly_clean USING btree(month)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_view_stats_monthly_clean_source ON repo_view_stats_monthly_clean USING btree(source)"; - stmt.executeUpdate(sql); - - Calendar startCalendar = Calendar.getInstance(); - startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01")); - Calendar endCalendar = Calendar.getInstance(); - int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR); - int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH); - - // sql="CREATE OR REPLACE view repo_view_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is - // null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', - // ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, - // " + diffMonth +", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source - // from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order - // by d.id, d.new_date"; -// sql = "select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source INTO repo_view_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date"; - sql = "CREATE TABLE IF NOT EXISTS repo_view_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " - + diffMonth - + ", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_view_stats_monthly_id ON repo_view_stats_monthly USING btree (id)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_view_stats_monthly_month ON repo_view_stats_monthly USING btree(month)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_view_stats_monthly_source ON repo_view_stats_monthly USING btree(source)"; - stmt.executeUpdate(sql); - - sql = "CREATE OR REPLACE view repo_view_stats_monthly_sushi AS SELECT id, sum(number_of_views), extract('year' from date) ||'-'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') ||'-01' AS month, source FROM repo_view_stats group by id, month, source;"; - stmt.executeUpdate(sql); - - stmt.close(); - ConnectDB.getHiveConnection().commit(); - ConnectDB.getHiveConnection().close(); - } - - // Create repository downloads statistics - private void repositoryDownloadsStats() throws Exception { - stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - -// String sql = "SELECT entity_id AS id, COUNT(entity_id) AS number_of_downloads, timestamp::date AS date, source INTO repo_download_stats FROM piwiklog WHERE source!='5' AND action=\'download\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;"; - String sql = "CREATE TABLE IF NOT EXISTS repo_download_stats AS SELECT entity_id AS id, COUNT(entity_id) AS number_of_downloads, timestamp::date AS date, source FROM piwiklog WHERE source!='5' AND action=\'download\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_download_stats_id ON repo_download_stats USING btree (id)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_download_stats_date ON repo_download_stats USING btree(date)"; - stmt.executeUpdate(sql); - -// sql = "SELECT roid.id, sum(number_of_downloads), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source INTO repo_download_stats_monthly_clean FROM repo_download_stats rvs, result_oids roid WHERE rvs.id=roid.orid GROUP BY roid.id, month, source;"; - sql = "CREATE TABLE IF NOT EXISTS repo_download_stats_monthly_clean AS SELECT roid.id, sum(number_of_downloads), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source FROM repo_download_stats rvs, result_oids roid WHERE rvs.id=roid.orid GROUP BY roid.id, month, source;"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_download_stats_monthly_clean_id ON repo_download_stats_monthly_clean USING btree (id)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_download_stats_monthly_clean_month ON repo_download_stats_monthly_clean USING btree(month)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_download_stats_monthly_clean_source ON repo_download_stats_monthly_clean USING btree(source)"; - stmt.executeUpdate(sql); - - Calendar startCalendar = Calendar.getInstance(); - startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01")); - Calendar endCalendar = Calendar.getInstance(); - int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR); - int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH); - - // sql="CREATE OR REPLACE view repo_download_stats_monthly AS select d.id, d.new_date AS month, case when - // rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', - // ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, - // " + diffMonth +", 1) AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, - // source from repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and - // d.source=rdm.source order by d.id, d.new_date"; - // sql = "select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source INTO - // repo_download_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + - // interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) - // AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from - // repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order - // by d.id, d.new_date"; - sql = "CREATE TABLE IF NOT EXISTS repo_download_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " - + diffMonth - + ", 1) AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_download_stats_monthly_id ON repo_download_stats_monthly USING btree (id)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_download_stats_monthly_month ON repo_download_stats_monthly USING btree(month)"; - stmt.executeUpdate(sql); - - sql = "CREATE INDEX repo_download_stats_monthly_source ON repo_download_stats_monthly USING btree(source)"; - stmt.executeUpdate(sql); - - sql = "CREATE OR REPLACE view repo_download_stats_monthly_sushi AS SELECT id, sum(number_of_downloads), extract('year' from date) ||'-'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') ||'-01' AS month, source FROM repo_download_stats group by id, month, source;"; - stmt.executeUpdate(sql); - - stmt.close(); - ConnectDB.getHiveConnection().commit(); - ConnectDB.getHiveConnection().close(); - } - public void processPortalLog() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); @@ -1154,47 +793,17 @@ public class PiwikStatsDB { String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; stmt.executeUpdate(sql); - - logger.info("Inserting data to views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to pageviews_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Creating usage_stats table"); - String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + - "AS SELECT coalesce(ds.source, vs.source) as source, " + - "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + - "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + - "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + - "coalesce(ds.openaire, 0) as openaire_downloads, " + - "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + - "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(createUsageStats); - logger.info("Created usage_stats table"); - - /* - * logger.info("Dropping table views_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table downloads_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table pageviews_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table process_portal_log_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; stmt.executeUpdate(sql); - */ + logger.info("Dropping piwiklogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogtmp"); + + logger.info("Dropping process_portal_log_tmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; + stmt.executeUpdate(sql); + logger.info("Dropped process_portal_log_tmp"); + stmt.close(); ConnectDB.getHiveConnection().close(); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java index 54ed286cb..4ca20c52e 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java @@ -285,7 +285,7 @@ public class SarcStats { stmtHive = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); stmtImpala = ConnectDB.getImpalaConnection().createStatement(); - +/* logger.info("Creating downloads_stats table_tmp"); String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp " @@ -366,7 +366,7 @@ public class SarcStats { + "`count` int)"; stmtHive.executeUpdate(createSushilog); logger.info("Created sushilog table"); - +*/ // Insert into sushilog logger.info("Inserting into sushilog"); String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java index c4ee7d63c..bf2187569 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java @@ -56,9 +56,14 @@ public class UsageStatsExporter { PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); logger.info("Re-creating database and tables"); - if (ExecuteWorkflow.recreateDbAndTables) + if (ExecuteWorkflow.recreateDbAndTables){ piwikstatsdb.recreateDBAndTables(); - ; + logger.info("DB-Tables-TmpTables are created "); + } +// else { +// piwikstatsdb.createTmpTables(); +// logger.info("TmpTables are created "); +// } logger.info("Initializing the download logs module"); PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); @@ -77,7 +82,7 @@ public class UsageStatsExporter { ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); } logger.info("Downloaded piwik logs"); -/* + // Create DB tables, insert/update statistics String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; piwikstatsdb.setCounterRobotsURL(cRobotsUrl); @@ -86,7 +91,7 @@ public class UsageStatsExporter { logger.info("Processing logs"); piwikstatsdb.processLogs(); } -*/ + logger.info("Creating LaReferencia tables"); LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, ExecuteWorkflow.lareferenciaAuthToken); @@ -101,7 +106,8 @@ public class UsageStatsExporter { lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); } -/* + + LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if (ExecuteWorkflow.processLaReferenciaLogs) { @@ -109,7 +115,8 @@ public class UsageStatsExporter { lastats.processLogs(); logger.info("LaReferencia logs done"); } -*/ + + IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables"); @@ -124,12 +131,15 @@ public class UsageStatsExporter { if (ExecuteWorkflow.irusDownloadReports) { irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } -/* + + if (ExecuteWorkflow.irusProcessStats) { irusstats.processIrusStats(); logger.info("Irus done"); } -*/ + + + SarcStats sarcStats = new SarcStats(); if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); @@ -137,13 +147,14 @@ public class UsageStatsExporter { if (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); } -/* + + if (ExecuteWorkflow.sarcProcessStats) { sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); -*/ + /* // finalize usagestats @@ -160,6 +171,7 @@ public class UsageStatsExporter { invalidateMetadata(); } */ + logger.info("End"); } diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml new file mode 100644 index 000000000..f400239f5 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -0,0 +1,79 @@ + + + + + + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + + 4.0.0 + dhp-usage-stats-build + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + dhp-usage-stats-build + diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java new file mode 100644 index 000000000..8f0f8eae7 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java @@ -0,0 +1,130 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.Properties; + +import org.apache.log4j.Logger; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +/** + * @author D. Pierrakos, S. Zoupanos + */ +import com.mchange.v2.c3p0.ComboPooledDataSource; + +public abstract class ConnectDB { + + public static Connection DB_HIVE_CONNECTION; + public static Connection DB_IMPALA_CONNECTION; + + private static String dbHiveUrl; + private static String dbImpalaUrl; + private static String usageRawDataDBSchema; + private static String usageStatsDBSchema; + private static String statsDBSchema; + private final static Logger log = Logger.getLogger(ConnectDB.class); + + static void init() throws ClassNotFoundException { + + dbHiveUrl = ExecuteWorkflow.dbHiveUrl; + dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; + usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; + statsDBSchema = ExecuteWorkflow.statsDBSchema; + usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema; + + Class.forName("org.apache.hive.jdbc.HiveDriver"); + } + + public static Connection getHiveConnection() throws SQLException { + if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { + return DB_HIVE_CONNECTION; + } else { + DB_HIVE_CONNECTION = connectHive(); + + return DB_HIVE_CONNECTION; + } + } + + public static Connection getImpalaConnection() throws SQLException { + if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { + return DB_IMPALA_CONNECTION; + } else { + DB_IMPALA_CONNECTION = connectImpala(); + + return DB_IMPALA_CONNECTION; + } + } + + public static String getUsageRawDataDBSchema() { + return usageRawDataDBSchema; + } + + public static String getUsageStatsDBSchema() { + return ConnectDB.usageStatsDBSchema; + } + + public static String getStatsDBSchema() { + return ConnectDB.statsDBSchema; + } + + private static Connection connectHive() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbHiveUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(30); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + return cpds.getConnection(); + + } + + private static Connection connectImpala() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbImpalaUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(30); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + + return cpds.getConnection(); + + } + +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java new file mode 100644 index 000000000..3f958abba --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java @@ -0,0 +1,172 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.BasicConfigurator; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class ExecuteWorkflow { + + static String matomoAuthToken; + static String matomoBaseURL; + static String repoLogPath; + static String portalLogPath; + static String portalMatomoID; + static String irusUKBaseURL; + static String irusUKReportPath; + static String sarcsReportPathArray; + static String sarcsReportPathNonArray; + static String lareferenciaLogPath; + static String lareferenciaBaseURL; + static String lareferenciaAuthToken; + static String dbHiveUrl; + static String dbImpalaUrl; + static String usageRawDataDBSchema; + static String usageStatsDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; + + static boolean piwikEmptyDirs; + static boolean downloadPiwikLogs; + static boolean processPiwikLogs; + + static Calendar startingLogPeriod; + static Calendar endingLogPeriod; + static int numberOfPiwikIdsToDownload; + static int numberOfSiteIdsToDownload; + + static boolean laReferenciaEmptyDirs; + static boolean downloadLaReferenciaLogs; + static boolean processLaReferenciaLogs; + + static boolean irusCreateTablesEmptyDirs; + static boolean irusDownloadReports; + static boolean irusProcessStats; + static int irusNumberOfOpendoarsToDownload; + + static boolean sarcCreateTablesEmptyDirs; + static boolean sarcDownloadReports; + static boolean sarcProcessStats; + static int sarcNumberOfIssnToDownload; + + static boolean finalizeStats; + static boolean finalTablesVisibleToImpala; + + static int numberOfDownloadThreads; + + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); + + public static void main(String args[]) throws Exception { + + // Sending the logs to the console + BasicConfigurator.configure(); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + UsageStatsExporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json"))); + parser.parseArgument(args); + + // Setting up the initial parameters + matomoAuthToken = parser.get("matomoAuthToken"); + matomoBaseURL = parser.get("matomoBaseURL"); + repoLogPath = parser.get("repoLogPath"); + portalLogPath = parser.get("portalLogPath"); + portalMatomoID = parser.get("portalMatomoID"); + irusUKBaseURL = parser.get("irusUKBaseURL"); + irusUKReportPath = parser.get("irusUKReportPath"); + sarcsReportPathArray = parser.get("sarcsReportPathArray"); + sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); + lareferenciaLogPath = parser.get("lareferenciaLogPath"); + lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); + lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); + + dbHiveUrl = parser.get("dbHiveUrl"); + dbImpalaUrl = parser.get("dbImpalaUrl"); + usageRawDataDBSchema = parser.get("usageRawDataDBSchema"); + usageStatsDBSchema = parser.get("usageStatsDBSchema"); + statsDBSchema = parser.get("statsDBSchema"); + + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { + processPiwikLogs = true; + } else { + processPiwikLogs = false; + } + + String startingLogPeriodStr = parser.get("startingLogPeriod"); + Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); + startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); + + String endingLogPeriodStr = parser.get("endingLogPeriod"); + Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); + endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); + + numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); + numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); + + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { + recreateDbAndTables = true; + } else { + recreateDbAndTables = false; + } + + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { + processLaReferenciaLogs = true; + } else { + processLaReferenciaLogs = false; + } + + if (parser.get("irusProcessStats").toLowerCase().equals("true")) { + irusProcessStats = true; + } else { + irusProcessStats = false; + } + + irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); + + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { + sarcProcessStats = true; + } else { + sarcProcessStats = false; + } + sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); + + if (parser.get("finalizeStats").toLowerCase().equals("true")) { + finalizeStats = true; + } else { + finalizeStats = false; + } + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) { + finalTablesVisibleToImpala = true; + } else { + numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); + } + + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); + usagestatsExport.export(); + } + + private static Calendar startingLogPeriodStr(Date date) { + + Calendar calendar = Calendar.getInstance(); + calendar.setTime(date); + return calendar; + + } +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java new file mode 100644 index 000000000..4f34adc04 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java @@ -0,0 +1,71 @@ +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class IrusStats { + + private String irusUKURL; + + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + + public IrusStats() throws Exception { + } + + + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + + logger.info("Creating irus_downloads_stats_tmp table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".irus_downloads_stats_tmp " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created irus_downloads_stats_tmp table"); + + logger.info("Inserting into irus_downloads_stats_tmp"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; + stmt.executeUpdate(insertDStats); + logger.info("Inserted into irus_downloads_stats_tmp"); + + stmt.close(); + //ConnectDB.getHiveConnection().close(); + } + + +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java new file mode 100644 index 000000000..ea3ac5948 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java @@ -0,0 +1,149 @@ + +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.io.*; +import java.net.URLDecoder; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.sql.Statement; +import java.sql.Timestamp; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class LaReferenciaStats { + + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class); + + private String logRepoPath; + + private Statement stmt = null; + + private String CounterRobotsURL; + private ArrayList robotsList; + + public LaReferenciaStats() throws Exception { + } + + + + public void processLogs() throws Exception { + try { + logger.info("LaReferencia creating viewsStats"); + viewsStats(); + logger.info("LaReferencia created viewsStats"); + logger.info("LaReferencia creating downloadsStats"); + downloadsStats(); + logger.info("LaReferencia created downloadsStats"); + +// logger.info("LaReferencia updating Production Tables"); +// updateProdTables(); +// logger.info("LaReferencia updated Production Tables"); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + + public void viewsStats() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating la_result_views_monthly_tmp view"); + String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp AS " + + + "SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + + "THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='action' and " + + "(source_item_type='oaItem' or source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + + "source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created la_result_views_monthly_tmp view"); + + logger.info("Dropping la_views_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".la_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Dropped la_views_stats_tmp table"); + + logger.info("Creating la_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp " + + "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.oid AND p.id=ro.oid " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month"; + stmt.executeUpdate(sql); + logger.info("Created la_views_stats_tmp table"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } + + private void downloadsStats() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating la_result_downloads_monthly_tmp view"); + String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + + ".la_result_downloads_monthly_tmp AS " + + "SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + + "THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='download' and " + + "(source_item_type='oaItem' or source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + + "source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created la_result_downloads_monthly_tmp view"); + + logger.info("Dropping la_downloads_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".la_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Dropped la_downloads_stats_tmp table"); + + logger.info("Creating la_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp " + + "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.oid AND p.id=ro.oid " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month"; + stmt.executeUpdate(sql); + logger.info("Created la_downloads_stats_tmp table"); + + stmt.close(); + //ConnectDB.getHiveConnection().close(); + } + + +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java new file mode 100644 index 000000000..a165c6eab --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java @@ -0,0 +1,345 @@ + +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.io.*; +import java.net.URLDecoder; +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.sql.Timestamp; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class PiwikStatsDB { + + private String logPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); + + + public PiwikStatsDB() throws Exception { + + } + + + public void recreateDBAndTables() throws Exception { + this.createDatabase(); + // The piwiklog table is not needed since it is built + // on top of JSON files + ////////////this.createTmpTables(); + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + + public void processLogs() throws Exception { + try { + + logger.info("ViewsStats processing starts at: "+new Timestamp(System.currentTimeMillis())); + viewsStats(); + logger.info("ViewsStats processing ends at: "+new Timestamp(System.currentTimeMillis())); + + logger.info("DownloadsStats processing starts at: "+new Timestamp(System.currentTimeMillis())); + downloadsStats(); + logger.info("DownloadsStats processing ends at: "+new Timestamp(System.currentTimeMillis())); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + + + public void viewsStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping openaire_result_views_monthly_tmp view"); + String drop_result_views_monthly = "DROP VIEW IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_piwikresult_views_monthly_tmp"; + stmt.executeUpdate(drop_result_views_monthly); + logger.info("Dropped openaire_result_views_monthly_tmp view"); + + logger.info("Creating openaire_result_views_monthly_tmp view"); + String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_views_monthly_tmp " + + "AS SELECT entity_id AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + + "AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + + ".piwiklog where action='action' and (source_item_type='oaItem' or " + + "source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + + "source ORDER BY source, entity_id"; + stmt.executeUpdate(create_result_views_monthly); + logger.info("Created openaire_result_views_monthly_tmp table"); + + logger.info("Dropping openaire_views_stats_tmp table"); + String drop_views_stats = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp"; + stmt.executeUpdate(drop_views_stats); + logger.info("Dropped openaire_views_stats_tmp table"); + + logger.info("Creating openaire_views_stats_tmp table"); + String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp " + + "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id AND p.id=ro.oid " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; + stmt.executeUpdate(create_views_stats); + logger.info("Created openaire_views_stats_tmp table"); + + logger.info("Creating openaire_pageviews_stats_tmp table"); + String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_pageviews_stats_tmp AS SELECT " + + "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=" + ExecuteWorkflow.portalMatomoID + " AND p.source=d.piwik_id and p.id=ro.id \n" + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; + stmt.executeUpdate(create_pageviews_stats); + logger.info("Created pageviews_stats table"); + + stmt.close(); + //ConnectDB.getHiveConnection().close(); + } + + private void downloadsStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping openaire_result_downloads_monthly_tmp view"); + String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_downloads_monthly_tmp"; + stmt.executeUpdate(drop_result_downloads_monthly); + logger.info("Dropped openaire_result_downloads_monthly_tmp view"); + + logger.info("Creating openaire_result_downloads_monthly_tmp view"); + String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp " + + "AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " + + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema()+ ".piwiklog where action='download' " + + "AND (source_item_type='oaItem' OR source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + + "ORDER BY source, entity_id, month"; + stmt.executeUpdate(sql); + logger.info("Created openaire_result_downloads_monthly_tmp view"); + + logger.info("Dropping openaire_downloads_stats_tmp table"); + String drop_views_stats = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp"; + stmt.executeUpdate(drop_views_stats); + logger.info("Dropped openaire_downloads_stats_tmp table"); + + logger.info("Creating openaire_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id and p.id=ro.oid " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; + stmt.executeUpdate(sql); + logger.info("Created downloads_stats table"); + + + logger.info("Dropping openaire_result_downloads_monthly_tmp view"); + sql = "DROP VIEW IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp"; + logger.info("Dropped openaire_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + stmt.close(); + //ConnectDB.getHiveConnection().close(); + } + + public void finalizeStats() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + //Dropping views_stats table + logger.info("Dropping views_stats table"); + String sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + logger.info("Dropped views_stats table "); + stmt.executeUpdate(sql); + + //Dropping downloads_stats table + logger.info("Dropping downloads_stats table"); + sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + logger.info("Dropped downloads_stats table "); + stmt.executeUpdate(sql); + + //Dropping page_views_stats table + logger.info("Dropping pageviews_stats table"); + sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + logger.info("Dropped pageviews_stats table "); + stmt.executeUpdate(sql); + + //Creating views_stats table + logger.info("Creating views_stats table"); + String createViewsStats = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".views_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET"; + stmt.executeUpdate(createViewsStats); + logger.info("Created views_stats table"); + + //Inserting OpenAIRE views stats + logger.info("Inserting Openaire data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Openaire views updated to views_stats"); + + //Inserting Lareferencia views stats + logger.info("Inserting LaReferencia data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("LaReferencia views updated to views_stats"); + + + logger.info("Creating downloads_stats table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created downloads_stats table"); + + //Inserting OpenAIRE downloads stats + logger.info("Inserting OpenAIRE data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp"; + stmt.executeUpdate(sql); + + //Inserting Lareferencia downloads stats + logger.info("Inserting LaReferencia data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Lareferencia downloads updated to downloads_stats"); + + //Inserting IRUS downloads stats + logger.info("Inserting IRUS data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("IRUS downloads updated to downloads_stats"); + + //Inserting SARC-OJS downloads stats + logger.info("Inserting SARC data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("SARC-OJS downloads updated to downloads_stats"); + + + logger.info("Creating pageviews_stats table"); + String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pageviews_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET"; + stmt.executeUpdate(create_pageviews_stats); + logger.info("Created pageviews_stats table"); + + //Inserting OpenAIRE views stats from Portal + logger.info("Inserting data to page_views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Dropping full_dates table"); + String dropFullDates = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".full_dates"; + stmt.executeUpdate(dropFullDates); + logger.info("Dropped full_dates table"); + + Calendar startCalendar = Calendar.getInstance(); + startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01")); + Calendar endCalendar = Calendar.getInstance(); + int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR); + int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH); + + logger.info("Creating full_dates table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " + + "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " + + "FROM (SELECT DATE '2016-01-01' AS from_date) p " + + "LATERAL VIEW " + + "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x"; + stmt.executeUpdate(sql); + logger.info("Created full_dates table"); + + + logger.info("Inserting data to usage_stats"); + sql = "CREATE TABLE IF NOT EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " + + "SELECT coalesce(ds.source, vs.source) as source, " + + "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + + "coalesce(ds.openaire, 0) as openaire_downloads, " + + "coalesce(vs.openaire, 0) as openaire_views " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + + ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; + stmt.executeUpdate(sql); + logger.info("Inserted data to usage_stats"); + + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } + + + private Connection getConnection() throws SQLException { + return ConnectDB.getHiveConnection(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java new file mode 100644 index 000000000..2d224075f --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java @@ -0,0 +1,106 @@ +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.io.*; +// import java.io.BufferedReader; +// import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class SarcStats { + + private Statement stmtHive = null; + private Statement stmtImpala = null; + + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + + public SarcStats() throws Exception { +// createTables(); + } + + private void createTables() throws Exception { + try { + + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); + + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); + + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void processSarc() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating sarc_downloads_stats_tmp table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_downloads_stats_tmp " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created sarc_downloads_stats_tmp table"); + + logger.info("Inserting into sarc_downloads_stats_tmp"); + String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " + + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_pids ro " + + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " + + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; + stmt.executeUpdate(insertSarcStats); + logger.info("Inserted into sarc_downloads_stats_tmp"); + + stmt.close(); + //ConnectDB.getHiveConnection().close(); + } + +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java new file mode 100644 index 000000000..43abb1681 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java @@ -0,0 +1,106 @@ +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.io.IOException; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Main class for downloading and processing Usage statistics + * + * @author D. Pierrakos, S. Zoupanos + */ +public class UsageStatsExporter { + + public UsageStatsExporter() { + + } + + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public void export() throws Exception { + + logger.info("Initialising DB properties"); + ConnectDB.init(); + +// runImpalaQuery(); + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(); + + logger.info("Re-creating database and tables"); + if (ExecuteWorkflow.recreateDbAndTables) { + piwikstatsdb.recreateDBAndTables(); + logger.info("DB-Tables are created "); + } +// else { +// piwikstatsdb.createTmpTables(); +// logger.info("TmpTables are created "); +// } + if (ExecuteWorkflow.processPiwikLogs) { + logger.info("Processing logs"); + piwikstatsdb.processLogs(); + } + + LaReferenciaStats lastats = new LaReferenciaStats(); + + if (ExecuteWorkflow.processLaReferenciaLogs) { + logger.info("Processing LaReferencia logs"); + lastats.processLogs(); + logger.info("LaReferencia logs done"); + } + + IrusStats irusstats = new IrusStats(); + + if (ExecuteWorkflow.irusProcessStats) { + logger.info("Processing IRUS"); + irusstats.processIrusStats(); + logger.info("Irus done"); + } + + SarcStats sarcStats = new SarcStats(); + + if (ExecuteWorkflow.sarcProcessStats) { + sarcStats.processSarc(); + } + logger.info("Sarc done"); + + // finalize usagestats + if (ExecuteWorkflow.finalizeStats) { + piwikstatsdb.finalizeStats(); + logger.info("Finalized stats"); + } + + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } + + logger.info("End"); + } + + private void invalidateMetadata() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getImpalaConnection().createStatement(); + + String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json new file mode 100644 index 000000000..3f121288e --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json @@ -0,0 +1,237 @@ +[ + { + "paramName": "mat", + "paramLongName": "matomoAuthToken", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "mbu", + "paramLongName": "matomoBaseURL", + "paramDescription": "URL of the isLookUp Service", + "paramRequired": true + }, + { + "paramName": "rlp", + "paramLongName": "repoLogPath", + "paramDescription": "nameNode of the source cluster", + "paramRequired": true + }, + { + "paramName": "plp", + "paramLongName": "portalLogPath", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "pmi", + "paramLongName": "portalMatomoID", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "iukbuw", + "paramLongName": "irusUKBaseURL", + "paramDescription": "working directory", + "paramRequired": true + }, + { + "paramName": "iukrp", + "paramLongName": "irusUKReportPath", + "paramDescription": "maximum number of map tasks used in the distcp process", + "paramRequired": true + }, + { + "paramName": "srpa", + "paramLongName": "sarcsReportPathArray", + "paramDescription": "memory for distcp action copying actionsets from remote cluster", + "paramRequired": true + }, + { + "paramName": "srpna", + "paramLongName": "sarcsReportPathNonArray", + "paramDescription": "timeout for distcp copying actions from remote cluster", + "paramRequired": true + }, + { + "paramName": "llp", + "paramLongName": "lareferenciaLogPath", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "lbu", + "paramLongName": "lareferenciaBaseURL", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "lat", + "paramLongName": "lareferenciaAuthToken", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "urdbs", + "paramLongName": "usageRawDataDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "usdbs", + "paramLongName": "usageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "piwikEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ppwl", + "paramLongName": "processPiwikLogs", + "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "dpwl", + "paramLongName": "downloadPiwikLogs", + "paramDescription": "download piwik logs?", + "paramRequired": true + }, + { + "paramName": "slp", + "paramLongName": "startingLogPeriod", + "paramDescription": "Starting log period", + "paramRequired": true + }, + { + "paramName": "elp", + "paramLongName": "endingLogPeriod", + "paramDescription": "Ending log period", + "paramRequired": true + }, + { + "paramName": "npidd", + "paramLongName": "numberOfPiwikIdsToDownload", + "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload", + "paramRequired": true + }, + { + "paramName": "nsidd", + "paramLongName": "numberOfSiteIdsToDownload", + "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload", + "paramRequired": true + }, + { + "paramName": "lerd", + "paramLongName": "laReferenciaEmptyDirs", + "paramDescription": "Empty LaReferencia directories?", + "paramRequired": true + }, + { + "paramName": "plrl", + "paramLongName": "processLaReferenciaLogs", + "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "dlrl", + "paramLongName": "downloadLaReferenciaLogs", + "paramDescription": "download La Referencia logs?", + "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "irusCreateTablesEmptyDirs", + "paramDescription": "Irus section: Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "irusDownloadReports", + "paramDescription": "Irus section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "irusProcessStats", + "paramDescription": "Irus section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "irusNumberOfOpendoarsToDownload", + "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", + "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "sarcCreateTablesEmptyDirs", + "paramDescription": "Sarc section: Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "sarcDownloadReports", + "paramDescription": "Sarc section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "sarcProcessStats", + "paramDescription": "Sarc section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "sarcNumberOfIssnToDownload", + "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", + "paramRequired": true + }, + + { + "paramName": "fs", + "paramLongName": "finalizeStats", + "paramDescription": "Create the usage_stats table?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", + "paramRequired": true + }, + { + "paramName": "nodt", + "paramLongName": "numberOfDownloadThreads", + "paramDescription": "Number of download threads", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/config-default.xml new file mode 100644 index 000000000..b5c807378 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml new file mode 100644 index 000000000..37700539b --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml @@ -0,0 +1,91 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + eu.dnetlib.oa.graph.usagestatsbuild.export.ExecuteWorkflow + --matomoAuthToken${matomoAuthToken} + --matomoBaseURL${matomoBaseURL} + --repoLogPath${repoLogPath} + --portalLogPath${portalLogPath} + --portalMatomoID${portalMatomoID} + --irusUKBaseURL${irusUKBaseURL} + --irusUKReportPath${irusUKReportPath} + --sarcsReportPathArray${sarcsReportPathArray} + --sarcsReportPathNonArray${sarcsReportPathNonArray} + --lareferenciaLogPath${lareferenciaLogPath} + --lareferenciaBaseURL${lareferenciaBaseURL} + --lareferenciaAuthToken${lareferenciaAuthToken} + --dbHiveUrl${hiveJdbcUrl} + --dbImpalaUrl${impalaJdbcUrl} + --usageRawDataDBSchema${usageRawDataDBSchema} + --usageStatsDBSchema${usageStatsDBSchema} + --statsDBSchema${statsDBSchema} + --recreateDbAndTables${recreateDbAndTables} + --piwikEmptyDirs${piwikEmptyDirs} + --downloadPiwikLogs${downloadPiwikLogs} + --processPiwikLogs${processPiwikLogs} + --startingLogPeriod${startingLogPeriod} + --endingLogPeriod${endingLogPeriod} + --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} + --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} + --laReferenciaEmptyDirs${laReferenciaEmptyDirs} + --downloadLaReferenciaLogs${downloadLaReferenciaLogs} + --processLaReferenciaLogs${processLaReferenciaLogs} + --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} + --irusDownloadReports${irusDownloadReports} + --irusProcessStats${irusProcessStats} + --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload} + --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs} + --sarcDownloadReports${sarcDownloadReports} + --sarcProcessStats${sarcProcessStats} + --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} + --finalizeStats${finalizeStats} + --finalTablesVisibleToImpala${finalTablesVisibleToImpala} + --numberOfDownloadThreads${numberOfDownloadThreads} + + + + + + + + From dc9c2f3272730b60dbf2def110bb7d2db67f9e42 Mon Sep 17 00:00:00 2001 From: Dimitris Date: Sat, 12 Dec 2020 12:00:14 +0200 Subject: [PATCH 05/16] Commit 12122020 --- .../nb-configuration.xml | 18 + .../dhp-usage-datasets-stats-update/pom.xml | 35 +- .../runworkflow.sh | 1 + .../datasetsusagestats/export/ConnectDB.java | 87 +- .../export/DatasetsStatsDB.java | 168 ++++ .../DownloadReportsListFromDatacite.java | 199 ++-- .../export/ExecuteWorkflow.java | 25 +- .../export/ReadReportsListFromDatacite.java | 408 ++++++++ .../export/UsageStatsExporter.java | 275 ++---- .../dhp-usage-raw-data-update/pom.xml | 64 +- .../dhp-usage-raw-data-update/runworkflow.sh | 1 + .../graph/usagerawdata/export/ConnectDB.java | 2 +- .../usagerawdata/export/ExecuteWorkflow.java | 87 +- .../graph/usagerawdata/export/IrusStats.java | 619 ++++++------- .../export/LaReferenciaDownloadLogs.java | 336 +++---- .../export/LaReferenciaStats.java | 159 +--- .../export/PiwikDownloadLogs.java | 483 +++++----- .../usagerawdata/export/PiwikStatsDB.java | 672 +++++++------- .../graph/usagerawdata/export/SarcStats.java | 875 ++++++++---------- .../export/UsageStatsExporter.java | 96 +- .../export/usagerawdata_parameters.json | 12 - .../graph/usagerawdata/oozie_app/workflow.xml | 2 - dhp-workflows/dhp-usage-stats-build/pom.xml | 30 +- .../dhp-usage-stats-build/runworkflow.sh | 1 + .../usagestatsbuild/export/ConnectDB.java | 175 ++-- .../export/ExecuteWorkflow.java | 237 +++-- .../usagestatsbuild/export/IrusStats.java | 66 +- .../export/LaReferenciaStats.java | 8 +- .../usagestatsbuild/export/PiwikStatsDB.java | 546 +++++++---- .../usagestatsbuild/export/SarcStats.java | 119 +-- .../export/UsageStatsExporter.java | 143 +-- .../export/usagestatsbuild_parameters.json | 359 +++---- .../usagestatsbuild/oozie_app/workflow.xml | 20 +- 33 files changed, 3306 insertions(+), 3022 deletions(-) create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml create mode 100755 dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java create mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java create mode 100755 dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh create mode 100755 dhp-workflows/dhp-usage-stats-build/runworkflow.sh diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml new file mode 100644 index 000000000..a65c4514a --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml @@ -0,0 +1,18 @@ + + + + + + JDK_1.8 + + diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml index 14b543a57..5593d4d87 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml +++ b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml @@ -23,7 +23,35 @@ 4.0.0 dhp-usage-datasets-stats-update - + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + UTF-8 UTF-8 @@ -68,6 +96,11 @@ dhp-common ${project.version} + + com.mchange + c3p0 + 0.9.5.2 + c3p0 c3p0 diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh new file mode 100755 index 000000000..9b4325508 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java index e6da7eff3..25b30e8ad 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java @@ -32,8 +32,8 @@ public abstract class ConnectDB { private static String datasetUsageStatsDBSchema; private static String statsDBSchema; private final static Logger logger = Logger.getLogger(ConnectDB.class); - private Statement stmt = null; - + private Statement stmt = null; + static void init() throws ClassNotFoundException { dbHiveUrl = ExecuteWorkflow.dbHiveUrl; @@ -79,6 +79,7 @@ public abstract class ConnectDB { */ ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbHiveUrl); + cpds.setUser("dimitris.pierrakos"); cpds.setAcquireIncrement(1); cpds.setMaxPoolSize(100); cpds.setMinPoolSize(1); @@ -93,10 +94,10 @@ public abstract class ConnectDB { cpds.setCheckoutTimeout(0); cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - - logger.info("Opened database successfully"); - return cpds.getConnection(); + logger.info("Opened database successfully"); + + return cpds.getConnection(); } @@ -107,6 +108,7 @@ public abstract class ConnectDB { */ ComboPooledDataSource cpds = new ComboPooledDataSource(); cpds.setJdbcUrl(dbImpalaUrl); + cpds.setUser("dimitris.pierrakos"); cpds.setAcquireIncrement(1); cpds.setMaxPoolSize(100); cpds.setMinPoolSize(1); @@ -122,81 +124,8 @@ public abstract class ConnectDB { cpds.setPreferredTestQuery("SELECT 1"); cpds.setIdleConnectionTestPeriod(60); - logger.info("Opened database successfully"); + logger.info("Opened database successfully"); return cpds.getConnection(); } - - private void createDatabase() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Dropping logs DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); - } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); - } - - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); - - } catch (Exception e) { - logger.error("Failed to create database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } - } - - private void createTables() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - // Create Piwiklog table - This table should exist - String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) " - + "into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTablePiwikLog); - - ///////////////////////////////////////// - // Rule for duplicate inserts @ piwiklog - ///////////////////////////////////////// - - String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTablePortalLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ process_portal_log - ////////////////////////////////////////////////// - - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } } -/* -CREATE TABLE IF NOT EXISTS dataciteReports (reportid STRING, - name STRING, - source STRING, - release STRING, - createdby STRING, - report_end_date STRING, - report_start_date STRING) - CLUSTERED BY (reportid) - into 100 buckets stored as orc tblproperties('transactional'='true'); -*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java new file mode 100644 index 000000000..88db1f819 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java @@ -0,0 +1,168 @@ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.*; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class DatasetsStatsDB { + + private String logPath; + private String logRepoPath; + private String logPortalPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class); + + private String CounterRobotsURL; + private ArrayList robotsList; + + public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception { + this.logRepoPath = logRepoPath; + this.logPortalPath = logPortalPath; + + } + + public void recreateDBAndTables() throws Exception { + this.createDatabase(); + this.createTables(); + } + +// public void reCreateLogDirs() throws IllegalArgumentException, IOException { +// FileSystem dfs = FileSystem.get(new Configuration()); +// +// logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); +// dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); +// +// logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); +// dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); +// +// logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); +// dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); +// +// logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); +// dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); +// } + public ArrayList getRobotsList() { + return robotsList; + } + + public void setRobotsList(ArrayList robotsList) { + this.robotsList = robotsList; + } + + public String getCounterRobotsURL() { + return CounterRobotsURL; + } + + public void setCounterRobotsURL(String CounterRobotsURL) { + this.CounterRobotsURL = CounterRobotsURL; + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + // Create Reports table - This table should exist + logger.info("Creating Reports Table"); + String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports(reportid STRING, \n" + + " name STRING, \n" + + " source STRING,\n" + + " release STRING,\n" + + " createdby STRING,\n" + + " report_start_date STRING,\n" + + " report_end_date STRING)\n" + + " CLUSTERED BY (reportid)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + + stmt.executeUpdate(sqlCreateTableDataciteReports); + logger.info("Reports Table Created"); + + // Create Datasets Table + logger.info("Creating DataSets Table"); + String sqlCreateTableDataSets = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasets(ds_type STRING,\n" + + " ds_title STRING,\n" + + " yop STRING,\n" + + " uri STRING,\n" + + " platform STRING,\n" + + " data_type STRING,\n" + + " publisher STRING,\n" + + " publisher_id_type STRING,\n" + + " publisher_id_value STRING,\n" + + " ds_dates_type STRING,\n" + + " ds_pub_date STRING,\n" + + " ds_contributors STRING,\n" + // + " ds_contributor_value array ,\n" + + " reportid STRING)\n" + + " CLUSTERED BY (ds_type)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableDataSets); + logger.info("DataSets Table Created"); + + // Create Datasets Performance Table + logger.info("Creating DataSetsPerformance Table"); + String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datasetsperformance(ds_type STRING,\n" + + " period_end STRING,\n" + + " period_from STRING,\n" + + " access_method STRING,\n" + + " metric_type STRING,\n" + + " count INT,\n" + + " country_counts STRING,\n" + + " reportid STRING)\n" + + " CLUSTERED BY (ds_type)\n" + + " into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableDataSetsPerformance); + logger.info("DataSetsPerformance Table Created"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + private Connection getConnection() throws SQLException { + return ConnectDB.getHiveConnection(); + } +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java index 196238ea2..a73b299ec 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java @@ -1,97 +1,102 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.net.URL; -import com.google.gson.JsonObject; -import java.util.ArrayList; -import java.util.Iterator; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.parser.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * - * @author dpie - */ -public class DownloadReportsListFromDatacite { - - private String dataciteBaseURL; - private String dataciteReportPath; - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) throws MalformedURLException, Exception { - - this.dataciteBaseURL = dataciteBaseURL; - this.dataciteReportPath = dataciteReportPath; - } - - public void downloadReportsList() throws ParseException { - StringBuilder responseStrBuilder = new StringBuilder(); - - Gson gson = new Gson(); - - try { - BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream()); - logger.info("Downloading from " + dataciteBaseURL); - - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String inputStr; - - while ((inputStr = streamReader.readLine()) != null) { - responseStrBuilder.append(inputStr); - } - } catch (IOException e) { - logger.info(e.getMessage()); - } - JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class); - JsonArray dataArray = jsonObject.getAsJsonArray("reports"); - ArrayList reportsList = new ArrayList(); - for (JsonElement element : dataArray) { - reportsList.add(element.getAsJsonObject().get("id").getAsString()); - } - - Iterator it = reportsList.iterator(); - while (it.hasNext()) { - String reportId = it.next().toString(); - String url = dataciteBaseURL + reportId; - - try { - BufferedInputStream in = new BufferedInputStream(new URL(url).openStream()); - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String inputStr; - StringBuilder responseStrBuilder2 = new StringBuilder(); - while ((inputStr = streamReader.readLine()) != null) { - responseStrBuilder2.append(inputStr); - } - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs.create(new Path(dataciteReportPath + "/" + reportId + ".json"), - true); - byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - fin.close(); - - fin.close(); - } catch (IOException e) { - System.out.println(e); - } - } - } -} +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Iterator; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +/** + * @author dpie + */ +public class DownloadReportsListFromDatacite { + + private String dataciteBaseURL; + private String dataciteReportPath; + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) + throws MalformedURLException, Exception { + + this.dataciteBaseURL = dataciteBaseURL; + this.dataciteReportPath = dataciteReportPath; + } + + public void downloadReportsList() throws ParseException { + StringBuilder responseStrBuilder = new StringBuilder(); + + Gson gson = new Gson(); + + try { + BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream()); + logger.info("Downloading from " + dataciteBaseURL); + + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String inputStr; + + while ((inputStr = streamReader.readLine()) != null) { + responseStrBuilder.append(inputStr); + } + } catch (IOException e) { + logger.info(e.getMessage()); + } + JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class); + JsonArray dataArray = jsonObject.getAsJsonArray("reports"); + ArrayList reportsList = new ArrayList(); + for (JsonElement element : dataArray) { + reportsList.add(element.getAsJsonObject().get("id").getAsString()); + } + + Iterator it = reportsList.iterator(); + while (it.hasNext()) { + String reportId = it.next().toString(); + String url = dataciteBaseURL + reportId; + + try { + BufferedInputStream in = new BufferedInputStream(new URL(url).openStream()); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String inputStr; + StringBuilder responseStrBuilder2 = new StringBuilder(); + while ((inputStr = streamReader.readLine()) != null) { + responseStrBuilder2.append(inputStr); + } + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path(dataciteReportPath + "/" + reportId + ".json"), + true); + byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + fin.close(); + + fin.close(); + } catch (IOException e) { + System.out.println(e); + } + } + } +} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java index 7b3db3115..b28578e4b 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java @@ -18,14 +18,13 @@ public class ExecuteWorkflow { static String dataciteBaseURL; static String dataciteReportPath; - static String dbHiveUrl; - static String dbImpalaUrl; - static String datasetUsageStatsDBSchema; - static String statsDBSchema; - static boolean recreateDbAndTables; - static boolean datasetsEmptyDirs; - static boolean finalTablesVisibleToImpala; - + static String dbHiveUrl; + static String dbImpalaUrl; + static String datasetUsageStatsDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; + static boolean datasetsEmptyDirs; + static boolean finalTablesVisibleToImpala; public static void main(String args[]) throws Exception { @@ -58,11 +57,11 @@ public class ExecuteWorkflow { else datasetsEmptyDirs = false; - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) - finalTablesVisibleToImpala = true; - else - finalTablesVisibleToImpala = false; - +// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) +// finalTablesVisibleToImpala = true; +// else +// finalTablesVisibleToImpala = false; +// UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); } diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java new file mode 100644 index 000000000..ccb3eebd3 --- /dev/null +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java @@ -0,0 +1,408 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.datasetsusagestats.export; + +import java.io.*; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.sql.Array; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Iterator; +import java.util.Set; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +/** + * @author dpie + */ +public class ReadReportsListFromDatacite { + + private String dataciteReportPath; + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception { + + this.dataciteReportPath = dataciteReportPath; + } + + public void readReports() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + File folder = new File(dataciteReportPath); + ArrayList jsonFiles = listHdfsDir(dataciteReportPath); + for (String jsonFile : jsonFiles) { + logger.info("Reading report file " + jsonFile); + this.createTmpReportsTable(jsonFile); + + String sqlSelectReportID = "SELECT get_json_object(json, '$.report.id') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlSelectReportID); + ResultSet rstmpReportID = stmt.getResultSet(); + + String reportID = null; + while (rstmpReportID.next()) { + reportID = rstmpReportID.getString(1); + } + + logger.info("Checking report with id " + reportID); + String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".datacitereports where reportid=?"; + PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); + stGetReportID.setString(1, reportID); + + ResultSet rsCheckIfReportExist = stGetReportID.executeQuery(); + + if (rsCheckIfReportExist.next()) { + logger.info("Report found with ID " + reportID); + dropTmpReportsTable(); + } else { + String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + " .datacitereports " + + "SELECT\n" + + " get_json_object(json, '$.report.id') AS reportid,\n" + + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" + + " get_json_object(json, '$.report.report-header.report-id') AS source,\n" + + " get_json_object(json, '$.report.report-header.release') AS release,\n" + + " get_json_object(json, '$.report.report-header.created-by\') AS createdby,\n" + + " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n" + + " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n" + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlInsertReport); + + logger.info("Report added"); + + logger.info("Adding datasets"); + String sqlSelecteDatasetsArray = "SELECT get_json_object(json, '$.report.report-datasets') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlSelecteDatasetsArray); + ResultSet rstmpReportDatasets = stmt.getResultSet(); + + if (rstmpReportDatasets.next() && rstmpReportDatasets.getString(1).indexOf(',') > 0) { + String[] listDatasets = rstmpReportDatasets.getString(1).split(","); + logger.info("Datasets found " + listDatasets.length); + + for (int i = 0; i < listDatasets.length; i++) { + + String sqlInsertDataSets = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + " .datasets " + + "SELECT\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-id[0].value') AS ds_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-title') AS ds_title,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].yop') AS yop,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].uri') AS uri,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].platform') AS platform,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].data-type') AS data_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + "].publisher') AS publisher,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].publisher-id.type[0]') AS publisher_id_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].publisher-id.value[0]') AS publisher_id_value,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-dates.type[0]') AS ds_dates_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-dates.value[0]') AS ds_dates_value,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-contributors') AS ds_contributors,\n" + + " get_json_object(json, '$.report.id') AS reportid \n" + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlInsertDataSets); + + logger.info("Dataset added " + i); + + logger.info("Adding Dataset Performance"); + String sqlSelecteDatasetsPerformance = "SELECT get_json_object(json, '$.report.report-datasets[" + + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlSelecteDatasetsPerformance); + ResultSet rstmpReportDatasetsPerformance = stmt.getResultSet(); + if (rstmpReportDatasetsPerformance.next() + && rstmpReportDatasetsPerformance.getString(1).indexOf(',') > 0) { + String[] listDatasetsPerformance = rstmpReportDatasetsPerformance.getString(1).split(","); + logger.info("Datasets Performance found " + listDatasetsPerformance.length); + for (int j = 0; j < listDatasetsPerformance.length; j++) { + String sqlSelecteDatasetsPerformanceInstance = "SELECT get_json_object(json, '$.report.report-datasets[" + + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".tmpjson"; + stmt.execute(sqlSelecteDatasetsPerformanceInstance); + ResultSet rstmpReportDatasetsPerformanceInstance = stmt.getResultSet(); + if (rstmpReportDatasetsPerformanceInstance.next() + && rstmpReportDatasetsPerformanceInstance.getString(1).indexOf(',') > 0) { + String[] listDatasetsPerformanceInstance = rstmpReportDatasetsPerformanceInstance + .getString(1) + .split(","); + logger.info("Datasets Performance found " + listDatasetsPerformanceInstance.length); + for (int k = 0; k < listDatasetsPerformanceInstance.length; k++) { + String sqlInsertDataSetsPerformance = "INSERT INTO " + + ConnectDB.getDataSetUsageStatsDBSchema() + " .datasetsperformance " + + "SELECT\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].dataset-id[0].value') AS ds_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].period.end-date') AS period_end,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].period.begin-date') AS period_from,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].instance[" + k + + "].access-method') AS access_method,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].instance[" + k + + "].metric-type') AS metric_type,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].instance[" + k + "].count') AS count,\n" + + " get_json_object(json, '$.report.report-datasets[" + i + + "].performance[" + j + "].instance[" + k + + "].country-counts') AS country_counts,\n" + + " get_json_object(json, '$.report.id') AS reportid \n" + + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlInsertDataSetsPerformance); + } + } + } + } + logger.info("DatasetPerformance added for dataset" + i); + } + } + logger.info("Adding gzip performance"); + String sqlSelecteReportSubsets = "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(sqlSelecteReportSubsets); + ResultSet rstmpReportSubsets = stmt.getResultSet(); + if (rstmpReportSubsets.next()) { + String unCompressedReport = uncompressString(rstmpReportSubsets.getString(1)); + this.readCompressedReport(unCompressedReport, reportID); + } + } + } + this.dropTmpReportsTable(); + } + + public void readCompressedReport(String report, String reportId) throws Exception { + Gson gson = new Gson(); + JsonObject jsonObject = gson.fromJson(report, JsonObject.class); + + JsonArray jsonReportDatasets; + if (jsonObject.getAsJsonArray("report_datasets") != null) { + jsonReportDatasets = jsonObject.getAsJsonArray("report_datasets"); + } else { + jsonReportDatasets = jsonObject.getAsJsonArray("report-datasets"); + } + + for (JsonElement datasetElement : jsonReportDatasets) { + // JsonElement dataset_title = datasetElement.getAsJsonObject().get("dataset-title"); + String dataset_title = datasetElement.getAsJsonObject().get("dataset-title").getAsString(); + String yop = datasetElement.getAsJsonObject().get("yop").getAsString(); + String uri = datasetElement.getAsJsonObject().get("uri").getAsString(); + String platform = datasetElement.getAsJsonObject().get("platform").getAsString(); + String data_type = datasetElement.getAsJsonObject().get("data-type").getAsString(); + String publisher = datasetElement.getAsJsonObject().get("publisher").getAsString(); + + JsonArray publisher_id = datasetElement.getAsJsonObject().getAsJsonArray("publisher-id"); + String publisher_id_type = ""; + String publisher_id_value = ""; + for (JsonElement publisher_id_Element : publisher_id) { + publisher_id_type = publisher_id_Element.getAsJsonObject().get("type").getAsString(); + publisher_id_value = publisher_id_Element.getAsJsonObject().get("value").getAsString(); + } + JsonArray dataset_days = datasetElement.getAsJsonObject().getAsJsonArray("dataset-dates"); + String ds_dates_type = ""; + String ds_dates_value = ""; + for (JsonElement datasetDaysElement : dataset_days) { + ds_dates_type = datasetDaysElement.getAsJsonObject().get("type").getAsString(); + ds_dates_value = datasetDaysElement.getAsJsonObject().get("value").getAsString(); + } + + JsonArray datasetContributors = null; + String ds_contributor_type = ""; + String[] ds_contributor_values = null; + Array ds_contributor_valuesArr = null; + + if (datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors") != null) { + datasetContributors = datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors"); + + JsonArray datasetid = datasetElement.getAsJsonObject().getAsJsonArray("dataset-id"); + String doi = ""; + for (JsonElement datasetIDElement : datasetid) +//System.out.println(datasetIDElement.getAsJsonObject().get("value").getAsString()); + { + doi = datasetIDElement.getAsJsonObject().get("value").getAsString(); + } + + String sqlInsertDataset = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() + + " .datasets(ds_type," + + "ds_title,yop,uri,platform,data_type,publisher,publisher_id_type,publisher_id_value," + + "ds_dates_type, ds_dates_value, ds_contributors,reportid) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?) "; + + PreparedStatement pstmtDataset = ConnectDB.DB_HIVE_CONNECTION.prepareStatement(sqlInsertDataset); + + pstmtDataset.setString(1, doi); + pstmtDataset.setString(2, dataset_title); + pstmtDataset.setString(3, yop); + pstmtDataset.setString(4, uri); + pstmtDataset.setString(5, platform); + pstmtDataset.setString(6, data_type); + pstmtDataset.setString(7, publisher); + pstmtDataset.setString(8, publisher_id_type); + pstmtDataset.setString(9, publisher_id_value); + pstmtDataset.setString(10, ds_dates_type); + pstmtDataset.setString(11, ds_dates_value); + pstmtDataset.setString(13, datasetContributors.getAsString()); + pstmtDataset.setString(14, reportId); + + pstmtDataset.execute(); + logger.info("Dataset from compressed report addded " + doi); + /* + * JsonArray performance = datasetElement.getAsJsonObject().getAsJsonArray("performance"); for + * (JsonElement performanceElement : performance) { JsonObject period = + * performanceElement.getAsJsonObject().getAsJsonObject("period"); String end_date = + * period.getAsJsonObject().get("end-date").getAsString(); String begin_date = + * period.getAsJsonObject().get("begin-date").getAsString(); JsonArray instance = + * performanceElement.getAsJsonObject().getAsJsonArray("instance"); for (JsonElement instanceElement : + * instance) { int count = instanceElement.getAsJsonObject().get("count").getAsInt(); JsonObject + * country_counts = instanceElement.getAsJsonObject().getAsJsonObject("country-counts"); Set + * keys = country_counts.keySet(); String[] country = new String[country_counts.size()]; String[] + * country_counts_val = new String[country_counts.size()]; Iterator it2 = keys.iterator(); int j = 0; + * while (it2.hasNext()) { country[j] = it2.next().toString(); country_counts_val[j] = + * country_counts.get(country[j]).getAsString(); } Array countryArr = conn.createArrayOf("text", + * country); Array countrycountsArr = conn.createArrayOf("text", country_counts_val); String metrictype + * = instanceElement.getAsJsonObject().get("metric-type").getAsString(); String accessMethod = + * instanceElement.getAsJsonObject().get("access-method").getAsString(); String + * sqlInsertDatasetPerformance = + * "INSERT INTO datasetperformance(ds_type,period_end,period_from,access_method,metric_type,count,country,country_count, reportid) VALUES(?,?,?,?,?,?,?,?,?)" + * ; PreparedStatement pstmtDatasetPerformance = conn.prepareStatement(sqlInsertDatasetPerformance); + * //System.out.println(begin_date + " " + end_date + " " + doi + " " + metrictype + " " + count); + * pstmtDatasetPerformance.setString(1, doi); pstmtDatasetPerformance.setString(2, end_date); + * pstmtDatasetPerformance.setString(3, begin_date); pstmtDatasetPerformance.setString(4, accessMethod); + * pstmtDatasetPerformance.setString(5, metrictype); pstmtDatasetPerformance.setInt(6, count); + * pstmtDatasetPerformance.setArray(7, countryArr); pstmtDatasetPerformance.setArray(8, + * countrycountsArr); pstmtDatasetPerformance.setString(9, reportId); pstmtDatasetPerformance.execute(); + * } } + */ + } + } + + } + + private ArrayList listHdfsDir(String dir) throws Exception { + + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + fileNames.add(fileName); + } + + hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir)); + throw new Exception("HDFS file path with exported data does not exist : " + dir, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + sb.append(line); + // sb.append(line); + line = br.readLine(); + } + // result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + result = sb.toString().trim(); + // fs.close(); + } catch (Exception e) { + throw new Exception(e); + } + + return result; + } + + public static String uncompressString(String zippedBase64Str) + throws IOException { + String result = null; + + // In my solr project, I use org.apache.solr.common.util.Base64. + // byte[] bytes = + // org.apache.solr.common.util.Base64.base64ToByteArray(zippedBase64Str); + byte[] bytes = Base64.getDecoder().decode(zippedBase64Str); + GZIPInputStream zi = null; + try { + zi = new GZIPInputStream(new ByteArrayInputStream(bytes)); + result = IOUtils.toString(zi); + } finally { + IOUtils.closeQuietly(zi); + } + return result; + } + + private void createTmpReportsTable(String jsonFile) throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + dropTmpReportsTable(); + String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() + + ".tmpjson (json STRING)"; + stmt.executeUpdate(createTmpTable); + logger.info("Tmp Table Created"); + + String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE " + + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + stmt.execute(insertJsonReport); + logger.info("JSON Report File inserted to tmpjson Table"); + } + + private void dropTmpReportsTable() throws SQLException { + logger.info("Dropping tmpjson Table"); + String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + stmt.executeUpdate(dropTmpTable); + logger.info("Dropped tmpjson Table"); + + } + +} + +/* + * PreparedStatement prepStatem = conn. + * prepareStatement("insert into usageStats (source, entityID,sourceItemType,entityType, counter,action,timestamp_month,referrer) values (?,?,?,?,?,?,?,?)" + * ); + */ diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java index 28c4f30a1..7b07fbc25 100644 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.datasetsusagestats.export; import java.io.IOException; @@ -17,220 +18,94 @@ import org.slf4j.LoggerFactory; */ public class UsageStatsExporter { - private Statement stmt = null; + private Statement stmt = null; - public UsageStatsExporter() { + public UsageStatsExporter() { - } + } - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - private void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); + private void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath); - dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true); + logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath); + dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true); - logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath)); + logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath)); - } + } - public void export() throws Exception { + public void export() throws Exception { - logger.info("Initialising DB properties"); - ConnectDB.init(); - ConnectDB.getHiveConnection(); + logger.info("Initialising DB properties"); + ConnectDB.init(); + ConnectDB.getHiveConnection(); - if (ExecuteWorkflow.recreateDbAndTables) { - createDatabase(); - createTables(); - reCreateLogDirs(); - } - logger.info("Initializing the download logs module"); - DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, ExecuteWorkflow.dataciteReportPath); + if (ExecuteWorkflow.recreateDbAndTables) { + DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", ""); + datasetsDB.recreateDBAndTables(); + } + logger.info("Initializing the download logs module"); + DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, + ExecuteWorkflow.dataciteReportPath); - if (ExecuteWorkflow.datasetsEmptyDirs) { - logger.info("Downloading Reports List From Datacite"); - drfd.downloadReportsList(); - logger.info("Reports List has been downloaded"); - } - } + if (ExecuteWorkflow.datasetsEmptyDirs) { + logger.info("Downloading Reports List From Datacite"); + drfd.downloadReportsList(); + logger.info("Reports List has been downloaded"); + } - private void createDatabase() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Dropping datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); - } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); - } - - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating datasetUsageStats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); - - } catch (Exception e) { - logger.error("Failed to create database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } - } - - private void createTables() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - // Create Reports table - This table should exist - String sqlCreateTableDataciteeReports = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports(reportid STRING, \n" - + " name STRING, \n" - + " source STRING,\n" - + " release STRING,\n" - + " createdby STRING,\n" - + " report_end_date STRING,\n" - + " report_start_date STRING)\n" - + " CLUSTERED BY (reportid)\n" - + " into 100 buckets stored as orc tblproperties('transactional'='true')"; - - stmt.executeUpdate(sqlCreateTableDataciteeReports); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite( + ExecuteWorkflow.dataciteReportPath); + logger.info("Store Reports To DB"); + readReportsListFromDatacite.readReports(); + logger.info("Reports Stored To DB"); + } // runImpalaQuery(); -/* - PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); - - logger.info("Re-creating database and tables"); - - logger.info("Initializing the download logs module"); - PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); - - if (ExecuteWorkflow.piwikEmptyDirs) { - logger.info("Recreating Piwik log directories"); - piwikstatsdb.reCreateLogDirs(); - } - - // Downloading piwik logs (also managing directory creation) - if (ExecuteWorkflow.downloadPiwikLogs) { - logger.info("Downloading piwik logs"); - piwd - .GetOpenAIRELogs( - ExecuteWorkflow.repoLogPath, - ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); - } - logger.info("Downloaded piwik logs"); - - // Create DB tables, insert/update statistics - String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; - piwikstatsdb.setCounterRobotsURL(cRobotsUrl); - - if (ExecuteWorkflow.processPiwikLogs) { - logger.info("Processing logs"); - piwikstatsdb.processLogs(); - } - - logger.info("Creating LaReferencia tables"); - LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, - ExecuteWorkflow.lareferenciaAuthToken); - - if (ExecuteWorkflow.laReferenciaEmptyDirs) { - logger.info("Recreating LaReferencia log directories"); - lrf.reCreateLogDirs(); - } - - if (ExecuteWorkflow.downloadLaReferenciaLogs) { - logger.info("Downloading LaReferencia logs"); - lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); - logger.info("Downloaded LaReferencia logs"); - } - LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); - - if (ExecuteWorkflow.processLaReferenciaLogs) { - logger.info("Processing LaReferencia logs"); - lastats.processLogs(); - logger.info("LaReferencia logs done"); - } - - IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); - if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { - logger.info("Creating Irus Stats tables"); - irusstats.createTables(); - logger.info("Created Irus Stats tables"); - - logger.info("Re-create log dirs"); - irusstats.reCreateLogDirs(); - logger.info("Re-created log dirs"); - } - - if (ExecuteWorkflow.irusDownloadReports) { - irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); - } - if (ExecuteWorkflow.irusProcessStats) { - irusstats.processIrusStats(); - logger.info("Irus done"); - } - - SarcStats sarcStats = new SarcStats(); - if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { - sarcStats.reCreateLogDirs(); - } - if (ExecuteWorkflow.sarcDownloadReports) { - sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - } - if (ExecuteWorkflow.sarcProcessStats) { - sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - sarcStats.finalizeSarcStats(); - } - logger.info("Sarc done"); - - // finalize usagestats - if (ExecuteWorkflow.finalizeStats) { - piwikstatsdb.finalizeStats(); - logger.info("Finalized stats"); - } - - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } - - logger.info("End"); - */ + /* + * PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + * logger.info("Re-creating database and tables"); logger.info("Initializing the download logs module"); + * PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); + * if (ExecuteWorkflow.piwikEmptyDirs) { logger.info("Recreating Piwik log directories"); + * piwikstatsdb.reCreateLogDirs(); } // Downloading piwik logs (also managing directory creation) if + * (ExecuteWorkflow.downloadPiwikLogs) { logger.info("Downloading piwik logs"); piwd .GetOpenAIRELogs( + * ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); } + * logger.info("Downloaded piwik logs"); // Create DB tables, insert/update statistics String cRobotsUrl = + * "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; + * piwikstatsdb.setCounterRobotsURL(cRobotsUrl); if (ExecuteWorkflow.processPiwikLogs) { + * logger.info("Processing logs"); piwikstatsdb.processLogs(); } logger.info("Creating LaReferencia tables"); + * LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, + * ExecuteWorkflow.lareferenciaAuthToken); if (ExecuteWorkflow.laReferenciaEmptyDirs) { + * logger.info("Recreating LaReferencia log directories"); lrf.reCreateLogDirs(); } if + * (ExecuteWorkflow.downloadLaReferenciaLogs) { logger.info("Downloading LaReferencia logs"); + * lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); } + * LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if + * (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); lastats.processLogs(); + * logger.info("LaReferencia logs done"); } IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if + * (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables"); + * irusstats.createTables(); logger.info("Created Irus Stats tables"); logger.info("Re-create log dirs"); + * irusstats.reCreateLogDirs(); logger.info("Re-created log dirs"); } if (ExecuteWorkflow.irusDownloadReports) { + * irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } if (ExecuteWorkflow.irusProcessStats) { + * irusstats.processIrusStats(); logger.info("Irus done"); } SarcStats sarcStats = new SarcStats(); if + * (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); } if + * (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, + * ExecuteWorkflow.sarcsReportPathNonArray); } if (ExecuteWorkflow.sarcProcessStats) { + * sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + * sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); // finalize usagestats if + * (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); logger.info("Finalized stats"); } // Make the + * tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) { + * logger.info("Making tables visible to Impala"); invalidateMetadata(); } logger.info("End"); + */ } /* - private void invalidateMetadata() throws SQLException { - Statement stmt = null; - - stmt = ConnectDB.getImpalaConnection().createStatement(); - - String sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; - stmt.executeUpdate(sql); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } + * private void invalidateMetadata() throws SQLException { Statement stmt = null; stmt = + * ConnectDB.getImpalaConnection().createStatement(); String sql = "INVALIDATE METADATA " + + * ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + * + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + + * ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + + * ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); stmt.close(); + * ConnectDB.getHiveConnection().close(); } */ diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index 338b2a2c5..44f28ff56 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -2,13 +2,13 @@ - - + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 0.13.1-cdh5.2.1 2.5.0-cdh5.2.1 - - + + org.apache.spark @@ -53,16 +81,16 @@ 20180130 jar - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + eu.dnetlib.dhp dhp-common diff --git a/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh b/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh new file mode 100755 index 000000000..4465dae21 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagerawdata \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java index f76644c83..5b2e6804b 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java @@ -122,4 +122,4 @@ public abstract class ConnectDB { } -} +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java index 81e34b3e7..e0e0d3687 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java @@ -62,7 +62,6 @@ public class ExecuteWorkflow { static int sarcNumberOfIssnToDownload; static boolean finalizeStats; - static boolean finalTablesVisibleToImpala; static int numberOfDownloadThreads; @@ -98,98 +97,108 @@ public class ExecuteWorkflow { usageStatsDBSchema = parser.get("usageStatsDBSchema"); statsDBSchema = parser.get("statsDBSchema"); - if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { recreateDbAndTables = true; - else + } else { recreateDbAndTables = false; + } - if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) + if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) { piwikEmptyDirs = true; - else + } else { piwikEmptyDirs = false; + } - if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) + if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) { downloadPiwikLogs = true; - else + } else { downloadPiwikLogs = false; + } - if (parser.get("processPiwikLogs").toLowerCase().equals("true")) + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { processPiwikLogs = true; - else + } else { processPiwikLogs = false; + } - String startingLogPeriodStr = parser.get("startingLogPeriod"); + String startingLogPeriodStr = parser.get("startingLogPeriod"); Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); - String endingLogPeriodStr = parser.get("endingLogPeriod"); - Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); - endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); +// String endingLogPeriodStr = parser.get("endingLogPeriod"); +// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); +// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); - if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) + if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) { laReferenciaEmptyDirs = true; - else + } else { laReferenciaEmptyDirs = false; + } - if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) + if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) { downloadLaReferenciaLogs = true; - else + } else { downloadLaReferenciaLogs = false; + } - if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { processLaReferenciaLogs = true; - else + } else { processLaReferenciaLogs = false; + } - if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) + if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) { irusCreateTablesEmptyDirs = true; - else + } else { irusCreateTablesEmptyDirs = false; + } - if (parser.get("irusDownloadReports").toLowerCase().equals("true")) + if (parser.get("irusDownloadReports").toLowerCase().equals("true")) { irusDownloadReports = true; - else + } else { irusDownloadReports = false; + } - if (parser.get("irusProcessStats").toLowerCase().equals("true")) + if (parser.get("irusProcessStats").toLowerCase().equals("true")) { irusProcessStats = true; - else + } else { irusProcessStats = false; + } irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); - if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) + if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) { sarcCreateTablesEmptyDirs = true; - else + } else { sarcCreateTablesEmptyDirs = false; + } - if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) + if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) { sarcDownloadReports = true; - else + } else { sarcDownloadReports = false; + } - if (parser.get("sarcProcessStats").toLowerCase().equals("true")) + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { sarcProcessStats = true; - else + } else { sarcProcessStats = false; + } sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); -/* - if (parser.get("finalizeStats").toLowerCase().equals("true")) + if (parser.get("finalizeStats").toLowerCase().equals("true")) { finalizeStats = true; - else + } else { finalizeStats = false; - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) - finalTablesVisibleToImpala = true; - else - finalTablesVisibleToImpala = false; -*/ + } + numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); + // usagestatsExport.createdDBWithTablesOnly(); } private static Calendar startingLogPeriodStr(Date date) { diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java index bb8d8565e..7ec5b0fca 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagerawdata.export; import java.io.*; @@ -27,393 +28,331 @@ import org.slf4j.LoggerFactory; */ public class IrusStats { - private String irusUKURL; + private String irusUKURL; - private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); - public IrusStats(String irusUKURL) throws Exception { - this.irusUKURL = irusUKURL; - // The following may not be needed - It will be created when JSON tables are created + public IrusStats(String irusUKURL) throws Exception { + this.irusUKURL = irusUKURL; + // The following may not be needed - It will be created when JSON tables are created // createTmpTables(); - } + } - public void reCreateLogDirs() throws Exception { - FileSystem dfs = FileSystem.get(new Configuration()); + public void reCreateLogDirs() throws Exception { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); + logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); - logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); - } + logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); + } - public void createTables() throws Exception { - try { - logger.info("Creating sushilog"); - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog(source STRING, " - + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " - + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableSushiLog); - logger.info("Created sushilog"); + public void createTables() throws Exception { + try { + logger.info("Creating sushilog"); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); - // To see how to apply to the ignore duplicate rules and indexes -// stmt.executeUpdate(sqlCreateTableSushiLog); -// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO sushilog " -// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," -// + "sushilog.rid, sushilog.date " -// + "FROM sushilog " -// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlcreateRuleSushiLog); -// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; -// stmt.executeUpdate(createSushiIndex); - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } -// // The following may not be needed - It will be created when JSON tables are created -// private void createTmpTables() throws Exception { -// try { -// -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; -// stmt.executeUpdate(sqlCreateTableSushiLog); -// -// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;"); -// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;"; -// // stmt.executeUpdate(sqlCopyPublicSushiLog); -// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO sushilogtmp " -// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository," -// + "sushilogtmp.rid, sushilogtmp.date " -// + "FROM sushilogtmp " -// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlcreateRuleSushiLog); -// -// stmt.close(); -// ConnectDB.getConnection().close(); -// log.info("Sushi Tmp Tables Created"); -// } catch (Exception e) { -// log.error("Failed to create tables: " + e); -// throw new Exception("Failed to create tables: " + e.toString(), e); -// } -// } - public void processIrusStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); - logger.info("Dropping sushilogtmp_json table"); - String dropSushilogtmpJson = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sushilogtmp_json"; - stmt.executeUpdate(dropSushilogtmpJson); - logger.info("Dropped sushilogtmp_json table"); + logger.info("Dropping sushilogtmp_json table"); + String dropSushilogtmpJson = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sushilogtmp_json"; + stmt.executeUpdate(dropSushilogtmpJson); + logger.info("Dropped sushilogtmp_json table"); - logger.info("Creating irus_sushilogtmp_json table"); - String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" - + " `ItemIdentifier` ARRAY<\n" - + " struct<\n" - + " Type: STRING,\n" - + " Value: STRING\n" - + " >\n" - + " >,\n" - + " `ItemPerformance` ARRAY<\n" - + " struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >\n" - + " >\n" - + ")\n" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(createSushilogtmpJson); - logger.info("Created irus_sushilogtmp_json table"); + logger.info("Creating irus_sushilogtmp_json table"); + String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " Type: STRING,\n" + + " Value: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` ARRAY<\n" + + " struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(createSushilogtmpJson); + logger.info("Created irus_sushilogtmp_json table"); - logger.info("Dropping irus_sushilogtmp table"); - String dropSushilogtmp = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(dropSushilogtmp); - logger.info("Dropped irus_sushilogtmp table"); + logger.info("Dropping irus_sushilogtmp table"); + String dropSushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(dropSushilogtmp); + logger.info("Dropped irus_sushilogtmp table"); - logger.info("Creating irus_sushilogtmp table"); - String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp(source STRING, repository STRING, " - + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + "tblproperties('transactional'='true')"; - stmt.executeUpdate(createSushilogtmp); - logger.info("Created irus_sushilogtmp table"); + logger.info("Creating irus_sushilogtmp table"); + String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(createSushilogtmp); + logger.info("Created irus_sushilogtmp table"); - logger.info("Inserting to irus_sushilogtmp table"); - String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " - + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " - + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " - + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " - + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " - + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " - + "WHERE `ItemIdent`.`Type`= 'OAI'"; - stmt.executeUpdate(insertSushilogtmp); - logger.info("Inserted to irus_sushilogtmp table"); -/* - logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats table"); + logger.info("Inserting to irus_sushilogtmp table"); + String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " + + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " + + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + + "WHERE `ItemIdent`.`Type`= 'OAI'"; + stmt.executeUpdate(insertSushilogtmp); + logger.info("Inserted to irus_sushilogtmp table"); - logger.info("Inserting into downloads_stats"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_oids ro " - + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; - stmt.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats"); + logger.info("Inserting to sushilog table"); + String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(insertToShushilog); + logger.info("Inserted to sushilog table"); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository_id` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); -*/ - logger.info("Inserting to sushilog table"); - String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " - + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(insertToShushilog); - logger.info("Inserted to sushilog table"); + ConnectDB.getHiveConnection().close(); + } - ConnectDB.getHiveConnection().close(); - } + public void getIrusRRReport(String irusUKReportPath) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); - public void getIrusRRReport(String irusUKReportPath) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); + logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); - String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) - + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; - logger.info("(getIrusRRReport) Getting report: " + reportUrl); + logger.info("(getIrusRRReport) Getting report: " + reportUrl); - String text = getJson(reportUrl, "", ""); + String text = getJson(reportUrl, "", ""); - List opendoarsToVisit = new ArrayList(); - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - int i = 0; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); - for (Object identifier : itemIdentifier) { - JSONObject opendoar = (JSONObject) identifier; - if (opendoar.get("Type").toString().equals("OpenDOAR")) { - i++; - opendoarsToVisit.add(opendoar.get("Value").toString()); - break; - } - } - // break; - } + List opendoarsToVisit = new ArrayList(); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray != null) { + int i = 0; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject opendoar = (JSONObject) identifier; + if (opendoar.get("Type").toString().equals("OpenDOAR")) { + i++; + opendoarsToVisit.add(opendoar.get("Value").toString()); + break; + } + } + // break; + } - logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); + logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); - if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 - && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - } + if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 + && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + } - logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); + logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); - for (String opendoar : opendoarsToVisit) { - logger.info("Now working on openDoar: " + opendoar); - this.getIrusIRReport(opendoar, irusUKReportPath); - } + for (String opendoar : opendoarsToVisit) { + logger.info("Now working on openDoar: " + opendoar); + this.getIrusIRReport(opendoar, irusUKReportPath); + } + logger.info("(getIrusRRReport) Finished with report: " + reportUrl); + } else { + logger.info("IRUS Reports not found for day"); + } - logger.info("(getIrusRRReport) Finished with report: " + reportUrl); - } + } - private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { + private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { - logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); + logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); - ConnectDB.getHiveConnection().setAutoCommit(false); + ConnectDB.getHiveConnection().setAutoCommit(false); - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + // Setting the ending period (last day of the month) + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, "opendoar____::" + opendoar); - ResultSet rs_date = st.executeQuery(); - Date dateMax = null; - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - int batch_size = 0; +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); - } else { - while (start.before(end)) { - logger.info("date: " + simpleDateFormat.format(start.getTime())); - String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) - + "&RepositoryIdentifier=opendoar%3A" + opendoar - + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; - start.add(Calendar.MONTH, 1); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, "opendoar____::" + opendoar); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + int batch_size = 0; - logger.info("Downloading file: " + reportUrl); - String text = getJson(reportUrl, "", ""); - if (text == null) { - continue; - } + if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); + } else { + start.add(Calendar.MONTH, 1); + while (start.before(end)) { + logger.info("Downloading for date: " + simpleDateFormat.format(start.getTime())); + String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) + + "&RepositoryIdentifier=opendoar%3A" + opendoar + + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + start.add(Calendar.MONTH, 1); - FileSystem fs = FileSystem.get(new Configuration()); - String filePath = irusUKReportPath + "/" + "IrusIRReport_" - + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePath); - FSDataOutputStream fin = fs.create(new Path(filePath), true); + logger.info("Downloading file: " + reportUrl); + String text = getJson(reportUrl, "", ""); + if (text == null) { + continue; + } - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - if (jsonArray == null) { - continue; - } - String oai = ""; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - fin.write(jsonObjectRow.toJSONString().getBytes()); - fin.writeChar('\n'); - } + FileSystem fs = FileSystem.get(new Configuration()); + String filePath = irusUKReportPath + "/" + "IrusIRReport_" + + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePath); + FSDataOutputStream fin = fs.create(new Path(filePath), true); - fin.close(); - } + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray == null) { + continue; + } + String oai = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + fin.write(jsonObjectRow.toJSONString().getBytes()); + fin.writeChar('\n'); + } - } - //ConnectDB.getHiveConnection().close(); + fin.close(); + } - logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); - } + } + // ConnectDB.getHiveConnection().close(); - private String getJson(String url) throws Exception { - try { - System.out.println("===> Connecting to: " + url); - URL website = new URL(url); - System.out.println("Connection url -----> " + url); - URLConnection connection = website.openConnection(); + logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); + } - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); + private String getJson(String url) throws Exception { + try { + System.out.println("===> Connecting to: " + url); + URL website = new URL(url); + System.out.println("Connection url -----> " + url); + URLConnection connection = website.openConnection(); + + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); // response.append("\n"); - } - } + } + } - System.out.println("response ====> " + response.toString()); + System.out.println("response ====> " + response.toString()); - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } - private String getJson(String url, String username, String password) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL", e); - return null; - } - } + private String getJson(String url, String username, String password) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL", e); + return null; + } + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java index 88550579b..904290af8 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagerawdata.export; import java.io.*; @@ -27,49 +28,49 @@ import org.slf4j.LoggerFactory; */ public class LaReferenciaDownloadLogs { - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; - /* + /* * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; - private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; - private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); - public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; - this.createTables(); + public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + this.createTables(); // this.createTmpTables(); - } + } - public void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); - logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); - } + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } - private void createTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating LaReferencia tables"); - String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " - + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " - + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " - + "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableLareferenciaLog); - logger.info("Created LaReferencia tables"); + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); // String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " // + " ON INSERT TO lareferencialog " // + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," @@ -80,16 +81,16 @@ public class LaReferenciaDownloadLogs { // stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); // stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Lareferencia Tables Created"); + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - // System.exit(0); - } - } + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } // private void createTmpTables() throws Exception { // @@ -114,152 +115,159 @@ public class LaReferenciaDownloadLogs { // // System.exit(0); // } // } - private String getPiwikLogUrl() { - return piwikUrl + "/"; - } + private String getPiwikLogUrl() { + return piwikUrl + "/"; + } - private String getJson(String url) throws Exception { - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); + private String getJson(String url) throws Exception { + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); // response.append("\n"); - } - } + } + } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } - public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { + public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { - String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; - String content = ""; + String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; + String content = ""; - List siteIdsToVisit = new ArrayList(); + List siteIdsToVisit = new ArrayList(); - // Getting all the siteIds in a list for logging reasons & limiting the list - // to the max number of siteIds - content = getJson(baseApiUrl); - JSONParser parser = new JSONParser(); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); - } - logger.info("Found the following siteIds for download: " + siteIdsToVisit); + // Getting all the siteIds in a list for logging reasons & limiting the list + // to the max number of siteIds + content = getJson(baseApiUrl); + JSONParser parser = new JSONParser(); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); + } + logger.info("Found the following siteIds for download: " + siteIdsToVisit); - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 - && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } - logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); + logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); - for (int siteId : siteIdsToVisit) { - logger.info("Now working on LaReferencia MatomoId: " + siteId); - this.GetLaReFerenciaLogs(repoLogsPath, siteId); - } - } + for (int siteId : siteIdsToVisit) { + logger.info("Now working on LaReferencia MatomoId: " + siteId); + this.GetLaReFerenciaLogs(repoLogsPath, siteId); + } + } - public void GetLaReFerenciaLogs(String repoLogsPath, - int laReferencialMatomoID) throws Exception { + public void GetLaReFerenciaLogs(String repoLogsPath, + int laReferencialMatomoID) throws Exception { - logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); + logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".lareferencialog WHERE matomoid=?"); - st.setInt(1, laReferencialMatomoID); - Date dateMax = null; + logger.info("Ending period for log download: " + sdf.format(end.getTime())); - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialog WHERE matomoid=?"); + st.setInt(1, laReferencialMatomoID); + Date dateMax = null; - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - Date date = currDay.getTime(); - if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID); - } else { - logger - .info( - "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " - + sdf.format(date)); + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - outFolder = repoLogsPath; + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + Date date = currDay.getTime(); + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger + .info( + "Date found in logs " + dateMax + " and not downloanding Matomo logs for " + + laReferencialMatomoID); + } else { + logger + .info( + "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " + + sdf.format(date)); - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), - true); + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + outFolder = repoLogsPath; - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - int i = 0; + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path( + outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), + true); - JSONParser parser = new JSONParser(); - do { - String apiUrl = baseApiUrl; + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + int i = 0; - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + JSONParser parser = new JSONParser(); + do { + String apiUrl = baseApiUrl; - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - fin.write(jsonObjectRaw.toJSONString().getBytes()); - fin.writeChar('\n'); - } + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - logger - .info( - "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID - + " and for " - + sdf.format(date)); - i++; - } while (true); - fin.close(); - } - } - } + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + fin.write(jsonObjectRaw.toJSONString().getBytes()); + fin.writeChar('\n'); + } + + logger + .info( + "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID + + " and for " + + sdf.format(date)); + i++; + } while (true); + fin.close(); + } + } + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java index c20781767..bcf1711cb 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java @@ -61,15 +61,6 @@ public class LaReferenciaStats { "stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTableLareferenciaLog); logger.info("Created LaReferencia tables"); -// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialog " -// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," -// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " -// + "FROM lareferencialog " -// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; -// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); -// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); stmt.close(); ConnectDB.getHiveConnection().close(); @@ -82,30 +73,6 @@ public class LaReferenciaStats { } } -// private void createTmpTables() throws Exception { -// -// try { -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; -// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialogtmp " -// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," -// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " -// + "FROM lareferencialogtmp " -// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); -// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); -// -// stmt.close(); -// log.info("Lareferencia Tmp Tables Created"); -// -// } catch (Exception e) { -// log.error("Failed to create tmptables: " + e); -// throw new Exception("Failed to create tmp tables: " + e.toString(), e); -// // System.exit(0); -// } -// } - public void processLogs() throws Exception { try { logger.info("Processing LaReferencia repository logs"); @@ -116,16 +83,7 @@ public class LaReferenciaStats { removeDoubleClicks(); logger.info("LaReferencia removed double clicks"); -/******** - logger.info("LaReferencia creating viewsStats"); - viewsStats(); - logger.info("LaReferencia created viewsStats"); - logger.info("LaReferencia creating downloadsStats"); - downloadsStats(); - logger.info("LaReferencia created downloadsStats"); - -************/ - logger.info("LaReferencia updating Production Tables"); + logger.info("LaReferencia updating Production Tables"); updateProdTables(); logger.info("LaReferencia updated Production Tables"); @@ -255,88 +213,6 @@ public class LaReferenciaStats { // conn.close(); } - public void viewsStats() throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Creating la_result_views_monthly_tmp view"); - String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp AS " - + - "SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + - "THEN 1 ELSE 0 END) AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp where action='action' and " + - "(source_item_type='oaItem' or source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + - "source ORDER BY source, entity_id"; - stmt.executeUpdate(sql); - logger.info("Created la_result_views_monthly_tmp view"); - - logger.info("Dropping la_views_stats_tmp table"); - sql = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".la_views_stats_tmp"; - stmt.executeUpdate(sql); - logger.info("Dropped la_views_stats_tmp table"); - - logger.info("Creating la_views_stats_tmp table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp " + - "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(views) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.oid AND p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month"; - stmt.executeUpdate(sql); - logger.info("Created la_views_stats_tmp table"); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } - - private void downloadsStats() throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Creating la_result_downloads_monthly_tmp view"); - String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() - + ".la_result_downloads_monthly_tmp AS " + - "SELECT entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' " + - "THEN 1 ELSE 0 END) AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp where action='download' and " + - "(source_item_type='oaItem' or source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + - "source ORDER BY source, entity_id"; - stmt.executeUpdate(sql); - logger.info("Created la_result_downloads_monthly_tmp view"); - - logger.info("Dropping la_downloads_stats_tmp table"); - sql = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".la_downloads_stats_tmp"; - stmt.executeUpdate(sql); - logger.info("Dropped la_downloads_stats_tmp table"); - - logger.info("Creating la_downloads_stats_tmp table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp " + - "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(downloads) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_result_downloads_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.oid AND p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month"; - stmt.executeUpdate(sql); - logger.info("Created la_downloads_stats_tmp table"); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } - private void updateProdTables() throws SQLException, Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); @@ -346,40 +222,11 @@ public class LaReferenciaStats { String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog " + "select * from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp"; stmt.executeUpdate(sql); -/***** - logger.info("Updating views_stats"); - sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; - stmt.executeUpdate(sql); -// sql = "insert into public.views_stats select * from la_views_stats_tmp;"; -// stmt.executeUpdate(sql); - - logger.info("Updating downloads_stats"); - sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "select * from " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to usage_stats from lareferencia"); - sql = "INSERT INTO "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + - "SELECT coalesce(ds.source, vs.source) as source, " + - "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + - "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + - "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + - "coalesce(ds.openaire, 0) as openaire_downloads, " + - "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp AS vs ON ds.source=vs.source " + - "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(sql); - logger.info("Inserted data to usage_stats from lareferencia"); -// sql = "insert into public.downloads_stats select * from la_downloads_stats_tmp;"; -// stmt.executeUpdate(sql); -****/ logger.info("Dropping lareferencialogtmp"); sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp"; - logger.info("Dropped lareferencialogtmp"); - stmt.executeUpdate(sql); + logger.info("Dropped lareferencialogtmp"); + stmt.executeUpdate(sql); stmt.close(); ConnectDB.getHiveConnection().close(); diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java index 5cc9ec563..a84d6743f 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java @@ -1,9 +1,12 @@ + package eu.dnetlib.oa.graph.usagerawdata.export; import java.io.*; import java.net.Authenticator; import java.net.URL; import java.net.URLConnection; +import java.nio.file.Files; +import java.nio.file.Paths; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.Statement; @@ -30,299 +33,299 @@ import org.slf4j.LoggerFactory; */ public class PiwikDownloadLogs { - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; - /* + /* * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; - private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); + private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); - public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; + public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; - } + } - private String getPiwikLogUrl() { - return "https://" + piwikUrl + "/"; - } + private String getPiwikLogUrl() { + return "https://" + piwikUrl + "/"; + } - private String getJson(String url) throws Exception { - try { - logger.debug("Connecting to download the JSON: " + url); - URL website = new URL(url); - URLConnection connection = website.openConnection(); + private String getJson(String url) throws Exception { + try { + logger.debug("Connecting to download the JSON: " + url); + URL website = new URL(url); + URLConnection connection = website.openConnection(); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + url + " Exception: " + e); - throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); - } - } + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + url + " Exception: " + e); + throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); + } + } - class WorkerThread implements Runnable { + class WorkerThread implements Runnable { - private Calendar currDay; - private int siteId; - private String repoLogsPath; - private String portalLogPath; - private String portalMatomoID; + private Calendar currDay; + private int siteId; + private String repoLogsPath; + private String portalLogPath; + private String portalMatomoID; - public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws IOException { - this.currDay = (Calendar) currDay.clone(); - this.siteId = new Integer(siteId); - this.repoLogsPath = new String(repoLogsPath); - this.portalLogPath = new String(portalLogPath); - this.portalMatomoID = new String(portalMatomoID); - } + public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws IOException { + this.currDay = (Calendar) currDay.clone(); + this.siteId = new Integer(siteId); + this.repoLogsPath = new String(repoLogsPath); + this.portalLogPath = new String(portalLogPath); + this.portalMatomoID = new String(portalMatomoID); + } - public void run() { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - System.out - .println( - Thread.currentThread().getName() + " (Start) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId - + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath - + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - try { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + public void run() { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + System.out + .println( + Thread.currentThread().getName() + " (Start) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + try { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - System.out - .println( - Thread.currentThread().getName() + " (End) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId - + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath - + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + System.out + .println( + Thread.currentThread().getName() + " (End) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + } - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; - int i = 0; + int i = 0; - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); - writtenBytes += jsonObjectRawBytes.length + 1; - } + writtenBytes += jsonObjectRawBytes.length + 1; + } - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); - i++; - } while (true); + i++; + } while (true); - fs.close(); - } - } + fs.close(); + } + } - public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { + public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { - Statement statement = ConnectDB.getHiveConnection().createStatement(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + Statement statement = ConnectDB.getHiveConnection().createStatement(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - ResultSet rs = statement - .executeQuery( - "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() - + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + ResultSet rs = statement + .executeQuery( + "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() + + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); - // Getting all the piwikids in a list for logging reasons & limitting the list - // to the max number of piwikids - List piwikIdToVisit = new ArrayList(); - //while (rs.next()) - //piwikIdToVisit.add(rs.getInt(1)); - piwikIdToVisit.add(13); - piwikIdToVisit.add(109); - - logger.info("Found the following piwikIds for download: " + piwikIdToVisit); + // Getting all the piwikids in a list for logging reasons & limitting the list + // to the max number of piwikids + List piwikIdToVisit = new ArrayList(); + while (rs.next()) { + piwikIdToVisit.add(rs.getInt(1)); + } + logger.info("Found the following piwikIds for download: " + piwikIdToVisit); - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 - && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { - logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { + logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } - logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); + logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); - // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); - for (int siteId : piwikIdToVisit) { - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); + // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); + for (int siteId : piwikIdToVisit) { + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); + // Setting the ending period (last day of the month) + // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + // end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); - logger.info("Now working on piwikId: " + siteId); + logger.info("Now working on piwikId: " + siteId); - PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".piwiklog WHERE source=?"); - st.setInt(1, siteId); - Date dateMax = null; - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); + PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog WHERE source=?"); + st.setInt(1, siteId); + Date dateMax = null; + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - // logger.info("Date used " + currDay.toString()); - // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - // executor.execute(worker);// calling execute method of ExecutorService - logger.info("Date used " + currDay.getTime().toString()); + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + // logger.info("Date used " + currDay.toString()); + // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + // executor.execute(worker);// calling execute method of ExecutorService + logger.info("Date used " + currDay.getTime().toString()); - if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); - } else { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - } + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); + } else { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + } - } - } - // executor.shutdown(); - // while (!executor.isTerminated()) { - // } - // System.out.println("Finished all threads"); - } + } + } + // executor.shutdown(); + // while (!executor.isTerminated()) { + // } + // System.out.println("Finished all threads"); + } - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; - int i = 0; + int i = 0; - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); - writtenBytes += jsonObjectRawBytes.length + 1; - } + writtenBytes += jsonObjectRawBytes.length + 1; + } - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); - i++; - } while (true); + i++; + } while (true); - fs.close(); - } + fs.close(); + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java index 4903d9599..9144620b7 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java @@ -60,7 +60,7 @@ public class PiwikStatsDB { this.createTables(); // The piwiklog table is not needed since it is built // on top of JSON files - ////////////this.createTmpTables(); + //////////// this.createTmpTables(); } public ArrayList getRobotsList() { @@ -86,6 +86,7 @@ public class PiwikStatsDB { logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; stmt.executeUpdate(dropDatabase); + } catch (Exception e) { logger.error("Failed to drop database: " + e); throw new Exception("Failed to drop database: " + e.toString(), e); @@ -117,10 +118,15 @@ public class PiwikStatsDB { + "into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(sqlCreateTablePiwikLog); +// String dropT = "TRUNCATE TABLE " +// + ConnectDB.getUsageStatsDBSchema() +// + ".piwiklog "; +// stmt.executeUpdate(dropT); +// logger.info("truncated piwiklog"); + ///////////////////////////////////////// // Rule for duplicate inserts @ piwiklog ///////////////////////////////////////// - String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " @@ -131,7 +137,6 @@ public class PiwikStatsDB { ////////////////////////////////////////////////// // Rule for duplicate inserts @ process_portal_log ////////////////////////////////////////////////// - stmt.close(); ConnectDB.getHiveConnection().close(); @@ -141,47 +146,6 @@ public class PiwikStatsDB { } } -/***** public void createTmpTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " - + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " - + "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTmpTablePiwikLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ piwiklogtmp - ////////////////////////////////////////////////// - - ////////////////////////////////////////////////// - // Copy from public.piwiklog to piwiklog - ////////////////////////////////////////////////// - // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;"; - // stmt.executeUpdate(sqlCopyPublicPiwiklog); - - String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTmpTablePortalLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ process_portal_log_tmp - ////////////////////////////////////////////////// - - stmt.close(); - - } catch (Exception e) { - logger.error("Failed to create tmptables: " + e); - throw new Exception("Failed to create tmp tables: " + e.toString(), e); - // System.exit(0); - } - } -******/ public void processLogs() throws Exception { try { ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); @@ -203,23 +167,17 @@ public class PiwikStatsDB { processPortalLog(); logger.info("Portal logs process done"); - logger.info("Processing portal usagestats"); - portalStats(); + logger.info("Processing portal usagestats"); + portalLogs(); logger.info("Portal usagestats process done"); -/***** - logger.info("ViewsStats processing starts"); - viewsStats(); - logger.info("ViewsStats processing ends"); - - logger.info("DownloadsStats processing starts"); - downloadsStats(); - logger.info("DownloadsStats processing starts"); -*****/ logger.info("Updating Production Tables"); updateProdTables(); logger.info("Updated Production Tables"); + logger.info("Create Pedocs Tables"); + createPedocsOldUsageData(); + logger.info("Pedocs Tables Created"); } catch (Exception e) { logger.error("Failed to process logs: " + e); @@ -237,65 +195,65 @@ public class PiwikStatsDB { logger.info("Added JSON Serde jar"); logger.info("Dropping piwiklogtmp_json table"); - String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp_json"; + String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json"; stmt.executeUpdate(drop_piwiklogtmp_json); logger.info("Dropped piwiklogtmp_json table"); logger.info("Creating piwiklogtmp_json"); - String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp_json(\n" + - " `idSite` STRING,\n" + - " `idVisit` STRING,\n" + - " `country` STRING,\n" + - " `referrerName` STRING,\n" + - " `browser` STRING,\n" + - " `actionDetails` ARRAY<\n" + - " struct<\n" + - " type: STRING,\n" + - " url: STRING,\n" + - " `customVariables`: struct<\n" + - " `1`: struct<\n" + - " `customVariablePageValue1`: STRING\n" + - " >\n" + - " >,\n" + - " timestamp: String\n" + - " >\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; + String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " `customVariables`: struct<\n" + + " `1`: struct<\n" + + " `customVariablePageValue1`: STRING\n" + + " >\n" + + " >,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; stmt.executeUpdate(create_piwiklogtmp_json); logger.info("Created piwiklogtmp_json"); logger.info("Dropping piwiklogtmp table"); - String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp"; + String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp"; stmt.executeUpdate(drop_piwiklogtmp); logger.info("Dropped piwiklogtmp"); logger.info("Creating piwiklogtmp"); - String create_piwiklogtmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + - "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; + String create_piwiklogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(create_piwiklogtmp); logger.info("Created piwiklogtmp"); logger.info("Inserting into piwiklogtmp"); - String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + - "actiondetail.type as action, actiondetail.url as url, " + - "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + - "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + - "referrerName as referrer_name, browser as agent\n" + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + + "actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent\n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; stmt.executeUpdate(insert_piwiklogtmp); logger.info("Inserted into piwiklogtmp"); @@ -308,33 +266,31 @@ public class PiwikStatsDB { logger.info("Cleaning download double clicks"); // clean download double clicks - String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "WHERE EXISTS (\n" + - "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + - ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + - "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" - + - "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + - "AND p1.timestamp\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; + String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json(" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; stmt.executeUpdate(create_process_portal_log_tmp_json); logger.info("Created process_portal_log_tmp_json"); logger.info("Droping process_portal_log_tmp table"); - String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".process_portal_log_tmp"; + String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp"; stmt.executeUpdate(drop_process_portal_log_tmp); logger.info("Dropped process_portal_log_tmp"); logger.info("Creating process_portal_log_tmp"); - String create_process_portal_log_tmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + - ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + - "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + String create_process_portal_log_tmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; stmt.executeUpdate(create_process_portal_log_tmp); logger.info("Created process_portal_log_tmp"); logger.info("Inserting into process_portal_log_tmp"); String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log_tmp " + - "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " - + - "actiondetail.url as url, " + - "CASE\n" + - " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + - " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + - " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " - + - " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + - " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + - " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + - " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + - " ELSE '' " + - "END AS entity_id, " + - "CASE " + - " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + - " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + - " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + - " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + - " ELSE '' " + - "END AS source_item_type, " + - "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + - "browser as agent " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + + ".process_portal_log_tmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " + + "actiondetail.url as url, " + + "CASE\n" + + " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + + " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " + + " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + + " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + + " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + + " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + + " ELSE '' " + + "END AS entity_id, " + + "CASE " + + " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + + " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + + " ELSE '' " + + "END AS source_item_type, " + + "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + + "browser as agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; stmt.executeUpdate(insert_process_portal_log_tmp); logger.info("Inserted into process_portal_log_tmp"); stmt.close(); } - public void portalStats() throws SQLException { + public void portalLogs() throws SQLException { Connection con = ConnectDB.getHiveConnection(); Statement stmt = con.createStatement(); con.setAutoCommit(false); -// Original queries where of the style -// -// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp2, -// openaire_prod_stats_20200821.result_oids roid -// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null -// -// The following query is an example of how queries should be -// -// -// INSERT INTO usagestats_20200907.piwiklogtmp -// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp -// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id -// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL); -// -// We should consider if we would like the queries to be as the following -// -// INSERT INTO usagestats_20200907.piwiklogtmp -// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp -// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id -// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND -// roid.oid != ''); - logger.info("PortalStats - Step 1"); - String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + ".result_oids roid WHERE roid.id IS NOT NULL)"; stmt.executeUpdate(sql); stmt.close(); logger.info("PortalStats - Step 2"); stmt = con.createStatement(); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + ".datasource_oids roid WHERE roid.id IS NOT NULL)"; stmt.executeUpdate(sql); stmt.close(); @@ -494,12 +421,11 @@ public class PiwikStatsDB { */ logger.info("PortalStats - Step 3"); stmt = con.createStatement(); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + ".project_oids roid WHERE roid.id IS NOT NULL)"; stmt.executeUpdate(sql); stmt.close(); @@ -512,233 +438,233 @@ public class PiwikStatsDB { logger.info("Cleaning oai - Step 1"); stmt = ConnectDB.getHiveConnection().createStatement(); - String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + - "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; + String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + + "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 2"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + - "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + + "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 3"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + - "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + + "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 4"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + - "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + + "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 5"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + - "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 6"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + - "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 7"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + - "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 8"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + - "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 9"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + - "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 10"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + - "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 11"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + - "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 12"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + - "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 13"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + - "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 14"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + - "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 15"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + - "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 16"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + - "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 17"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + - "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 18"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + - "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 19"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + - "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 20"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + - "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 21"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + - "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 22"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + - "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 23"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + - "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 24"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + - "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 25"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + - "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 26"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + - "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 27"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + - "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 28"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + - "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + + "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; stmt.executeUpdate(sql); stmt.close(); logger.info("Cleaning oai - Step 29"); stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + - "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; stmt.executeUpdate(sql); stmt.close(); @@ -746,63 +672,83 @@ public class PiwikStatsDB { ConnectDB.getHiveConnection().close(); } - private String processPortalURL(String url) { - - if (url.indexOf("explore.openaire.eu") > 0) { - try { - url = URLDecoder.decode(url, "UTF-8"); - } catch (Exception e) { - logger.info("Error when decoding the following URL: " + url); - } - if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) { - url = "datasource|" - + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59); - } else if (url.indexOf("datasource=") > 0 - && url.substring(url.indexOf("datasource=") + 11).length() >= 46) { - url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57); - } else if (url.indexOf("datasourceFilter=") > 0 - && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) { - url = "datasource|" - + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63); - } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) { - url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56); - } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) { - url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56); - } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 - && !url.contains("oai:dnet:corda")) { - url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56); - } else if (url.indexOf("organizationId=") > 0 - && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) { - url = "organization|" - + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61); - } else { - url = ""; - } - } else { - url = ""; - } - - return url; - } - private void updateProdTables() throws SQLException { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Inserting data to piwiklog"); - String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; stmt.executeUpdate(sql); - + logger.info("Dropping piwiklogtmp"); sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; stmt.executeUpdate(sql); - logger.info("Dropped piwiklogtmp"); - + logger.info("Dropped piwiklogtmp"); + logger.info("Dropping process_portal_log_tmp"); sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; stmt.executeUpdate(sql); - logger.info("Dropped process_portal_log_tmp"); + logger.info("Dropped process_portal_log_tmp"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } + + public void finalizeStats() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping piwiklogtmp"); + String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogtmp"); + + logger.info("Dropping process_portal_log_tmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; + stmt.executeUpdate(sql); + logger.info("Dropped process_portal_log_tmp"); + + logger.info("Dropping irus_sushilogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped irus_sushilogtmp"); + + logger.info("Dropping irus_sushilogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped irus_sushilogtmp_json"); + + logger.info("Dropping lareferencialogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped lareferencialogtmp_json"); + + logger.info("Dropping piwiklogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogtmp_json"); + + logger.info("Dropping process_portal_log_tmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped process_portal_log_tmp_json"); + + logger.info("Dropping sarc_sushilogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp"); + + logger.info("Dropping sarc_sushilogtmp_json_array"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp_json_array"); + + logger.info("Dropping sarc_sushilogtmp_json_non_array"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp_json_non_array"); stmt.close(); ConnectDB.getHiveConnection().close(); @@ -868,4 +814,22 @@ public class PiwikStatsDB { private Connection getConnection() throws SQLException { return ConnectDB.getHiveConnection(); } + + public void createPedocsOldUsageData() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating PeDocs Old Views Table"); + String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pedocsoldviews as select * from default.pedocsviews"; + stmt.executeUpdate(sql); + logger.info("PeDocs Old Views Table created"); + + logger.info("Creating PeDocs Old Downloads Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pedocsolddownloads as select * from default.pedocsdownloads"; + stmt.executeUpdate(sql); + logger.info("PeDocs Old Downloads Table created"); + + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java index 4ca20c52e..e85c972f5 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagerawdata.export; import java.io.*; @@ -33,543 +34,467 @@ import org.slf4j.LoggerFactory; */ public class SarcStats { - private Statement stmtHive = null; - private Statement stmtImpala = null; + private Statement stmtHive = null; + private Statement stmtImpala = null; - private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); - public SarcStats() throws Exception { + public SarcStats() throws Exception { // createTables(); - } + } - private void createTables() throws Exception { - try { + private void createTables() throws Exception { + try { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; - stmtHive.executeUpdate(sqlCreateTableSushiLog); + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); - // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; - // stmt.executeUpdate(sqlCopyPublicSushiLog); - String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " - + " ON INSERT TO sushilog " - + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," - + "sushilog.rid, sushilog.date " - + "FROM sushilog " - + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; - stmtHive.executeUpdate(sqlcreateRuleSushiLog); - String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; - stmtHive.executeUpdate(createSushiIndex); + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } - public void reCreateLogDirs() throws IOException { - FileSystem dfs = FileSystem.get(new Configuration()); + public void reCreateLogDirs() throws IOException { + FileSystem dfs = FileSystem.get(new Configuration()); - logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); - dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); + logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); - logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); - dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); + logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); - logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); - dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); + logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); - logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); - dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); - } + logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); + } - public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); - logger.info("Dropping sarc_sushilogtmp_json_array table"); - String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); - logger.info("Dropped sarc_sushilogtmp_json_array table"); + logger.info("Dropping sarc_sushilogtmp_json_array table"); + String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); + logger.info("Dropped sarc_sushilogtmp_json_array table"); - logger.info("Creating sarc_sushilogtmp_json_array table"); - String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" - + " `ItemIdentifier` ARRAY<\n" - + " struct<\n" - + " `Type`: STRING,\n" - + " `Value`: STRING\n" - + " >\n" - + " >,\n" - + " `ItemPerformance` struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >\n" - + ")" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + sarcsReportPathArray + "/'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_array); - logger.info("Created sarc_sushilogtmp_json_array table"); + logger.info("Creating sarc_sushilogtmp_json_array table"); + String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_array); + logger.info("Created sarc_sushilogtmp_json_array table"); - logger.info("Dropping sarc_sushilogtmp_json_non_array table"); - String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); - logger.info("Dropped sarc_sushilogtmp_json_non_array table"); + logger.info("Dropping sarc_sushilogtmp_json_non_array table"); + String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); + logger.info("Dropped sarc_sushilogtmp_json_non_array table"); - logger.info("Creating sarc_sushilogtmp_json_non_array table"); - String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" - + " `ItemIdentifier` struct<\n" - + " `Type`: STRING,\n" - + " `Value`: STRING\n" - + " >,\n" - + " `ItemPerformance` struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >" - + ")" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + sarcsReportPathNonArray + "/'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); - logger.info("Created sarc_sushilogtmp_json_non_array table"); + logger.info("Creating sarc_sushilogtmp_json_non_array table"); + String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" + + " `ItemIdentifier` struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathNonArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); + logger.info("Created sarc_sushilogtmp_json_non_array table"); - logger.info("Creating sarc_sushilogtmp table"); - String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp(source STRING, repository STRING, " - + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + "tblproperties('transactional'='true')"; - stmt.executeUpdate(create_sarc_sushilogtmp); - logger.info("Created sarc_sushilogtmp table"); + logger.info("Creating sarc_sushilogtmp table"); + String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(create_sarc_sushilogtmp); + logger.info("Created sarc_sushilogtmp table"); - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " - + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " - + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " - + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " - + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " - + "WHERE `ItemIdent`.`Type`='DOI'"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "WHERE `ItemIdent`.`Type`='DOI'"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " - + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " - + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " - + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - ConnectDB.getHiveConnection().close(); - } + ConnectDB.getHiveConnection().close(); + } - public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); - logger.info("Dropping sarc_sushilogtmp table"); - String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp"; - stmt.executeUpdate(drop_sarc_sushilogtmp); - logger.info("Dropped sarc_sushilogtmp table"); - ConnectDB.getHiveConnection().close(); + logger.info("Dropping sarc_sushilogtmp table"); + String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp"; + stmt.executeUpdate(drop_sarc_sushilogtmp); + logger.info("Dropped sarc_sushilogtmp table"); + ConnectDB.getHiveConnection().close(); - List issnAndUrls = new ArrayList(); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" - }); - issnAndUrls.add(new String[]{ - "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" - }); + List issnAndUrls = new ArrayList(); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" + }); + issnAndUrls.add(new String[] { + "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" + }); - if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 - && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); - issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); - } + if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 + && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); + issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); + } - logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); + logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); - for (String[] issnAndUrl : issnAndUrls) { - logger.info("Now working on ISSN: " + issnAndUrl[1]); - getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); - } + for (String[] issnAndUrl : issnAndUrls) { + logger.info("Now working on ISSN: " + issnAndUrl[1]); + getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); + } - } + } - public void finalizeSarcStats() throws Exception { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - stmtImpala = ConnectDB.getImpalaConnection().createStatement(); -/* - logger.info("Creating downloads_stats table_tmp"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_tmp " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmtHive.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats_tmp table"); + public void updateSarcLogs() throws Exception { + stmtHive = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + stmtImpala = ConnectDB.getImpalaConnection().createStatement(); - logger.info("Dropping sarc_sushilogtmp_impala table"); - String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala"; - stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); - logger.info("Dropped sarc_sushilogtmp_impala table"); + // Insert into sushilog + logger.info("Inserting into sushilog"); + String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(insertSushiLog); + logger.info("Inserted into sushilog"); - logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); - String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala " - + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(createSarcSushilogtmpImpala); - logger.info("Created sarc_sushilogtmp_impala"); + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + } - logger.info("Making sarc_sushilogtmp visible to impala"); - String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala;"; - stmtImpala.executeUpdate(invalidateMetadata); + public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, + String url, String issn) throws Exception { + logger.info("Processing SARC! issn: " + issn + " with url: " + url); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Dropping downloads_stats_impala table"); - String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala"; - stmtHive.executeUpdate(drop_downloads_stats_impala); - logger.info("Dropped downloads_stats_impala table"); + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - logger.info("Making downloads_stats_impala deletion visible to impala"); - try { - String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala;"; - stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); - } catch (SQLException sqle) { - } + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); - // We run the following query in Impala because it is faster - logger.info("Creating downloads_stats_impala"); - String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala AS " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " - + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_pids ro " - + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " - + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; - stmtImpala.executeUpdate(createDownloadsStatsImpala); - logger.info("Creating downloads_stats_impala"); + logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - // Insert into downloads_stats - logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_tmp SELECT * " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; - stmtHive.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats_tmp"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, issn); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository_id` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmtHive.executeUpdate(createSushilog); - logger.info("Created sushilog table"); -*/ - // Insert into sushilog - logger.info("Inserting into sushilog"); - String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(insertSushiLog); - logger.info("Inserted into sushilog"); + // Creating the needed configuration for the correct storing of data + Configuration config = new Configuration(); + config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); + config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); + config + .set( + "fs.hdfs.impl", + org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + config + .set( + "fs.file.impl", + org.apache.hadoop.fs.LocalFileSystem.class.getName()); + FileSystem dfs = FileSystem.get(config); - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - } + if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn); + } else { + start.add(Calendar.MONTH, 1); + while (start.before(end)) { + String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); + start.add(Calendar.MONTH, 1); - public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, - String url, String issn) throws Exception { - logger.info("Processing SARC! issn: " + issn + " with url: " + url); - ConnectDB.getHiveConnection().setAutoCommit(false); + logger.info("(getARReport) Getting report: " + reportUrl); + String text = getJson(reportUrl); + if (text == null) { + continue; + } - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = null; + try { + jsonObject = (JSONObject) parser.parse(text); + } // if there is a parsing error continue with the next url + catch (ParseException pe) { + continue; + } - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("sc:Report"); + if (jsonObject == null) { + continue; + } + jsonObject = (JSONObject) jsonObject.get("c:Report"); + jsonObject = (JSONObject) jsonObject.get("c:Customer"); + Object obj = jsonObject.get("c:ReportItems"); + JSONArray jsonArray = new JSONArray(); + if (obj instanceof JSONObject) { + jsonArray.add(obj); + } else { + jsonArray = (JSONArray) obj; + // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); + } + if (jsonArray == null) { + continue; + } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, issn); - ResultSet rs_date = st.executeQuery(); - Date dateMax = null; - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathArray); + FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); - // Creating the needed configuration for the correct storing of data - Configuration config = new Configuration(); - config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); - config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); - config - .set( - "fs.hdfs.impl", - org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - config - .set( - "fs.file.impl", - org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem dfs = FileSystem.get(config); + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathNonArray); + FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); - if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn); - } else { - - while (start.before(end)) { - String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); - start.add(Calendar.MONTH, 1); + for (Object aJsonArray : jsonArray) { - logger.info("(getARReport) Getting report: " + reportUrl); - String text = getJson(reportUrl); - if (text == null) { - continue; - } + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + renameKeysRecursively(":", jsonObjectRow); - JSONParser parser = new JSONParser(); - JSONObject jsonObject = null; - try { - jsonObject = (JSONObject) parser.parse(text); - } // if there is a parsing error continue with the next url - catch (ParseException pe) { - continue; - } + if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { + finNonArray.write(jsonObjectRow.toJSONString().getBytes()); + finNonArray.writeChar('\n'); + } else { + finArray.write(jsonObjectRow.toJSONString().getBytes()); + finArray.writeChar('\n'); + } + } - jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("sc:Report"); - if (jsonObject == null) { - continue; - } - jsonObject = (JSONObject) jsonObject.get("c:Report"); - jsonObject = (JSONObject) jsonObject.get("c:Customer"); - Object obj = jsonObject.get("c:ReportItems"); - JSONArray jsonArray = new JSONArray(); - if (obj instanceof JSONObject) { - jsonArray.add(obj); - } else { - jsonArray = (JSONArray) obj; - // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); - } - if (jsonArray == null) { - continue; - } + finArray.close(); + finNonArray.close(); - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" - + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathArray); - FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); + // Check the file size and if it is too big, delete it + File fileArray = new File(filePathArray); + if (fileArray.length() == 0) { + fileArray.delete(); + } + File fileNonArray = new File(filePathNonArray); + if (fileNonArray.length() == 0) { + fileNonArray.delete(); + } - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" - + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathNonArray); - FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); + } - for (Object aJsonArray : jsonArray) { + dfs.close(); + } + // ConnectDB.getHiveConnection().close(); + } - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - renameKeysRecursively(":", jsonObjectRow); - - if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { - finNonArray.write(jsonObjectRow.toJSONString().getBytes()); - finNonArray.writeChar('\n'); - } else { - finArray.write(jsonObjectRow.toJSONString().getBytes()); - finArray.writeChar('\n'); - } - } - - finArray.close(); - finNonArray.close(); - - // Check the file size and if it is too big, delete it - File fileArray = new File(filePathArray); - if (fileArray.length() == 0) - fileArray.delete(); - File fileNonArray = new File(filePathNonArray); - if (fileNonArray.length() == 0) - fileNonArray.delete(); - - } - - dfs.close(); - } - //ConnectDB.getHiveConnection().close(); - } - - private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { - for (Object jjval : givenJsonObj) { - if (jjval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jjval); - } else if (jjval instanceof JSONObject) { - renameKeysRecursively(delimiter, (JSONObject) jjval); - } // All other types of vals - else + private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { + for (Object jjval : givenJsonObj) { + if (jjval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jjval); + } else if (jjval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jjval); + } // All other types of vals + else ; - } - } + } + } - private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { - Set jkeys = new HashSet(givenJsonObj.keySet()); - for (String jkey : jkeys) { + private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { + Set jkeys = new HashSet(givenJsonObj.keySet()); + for (String jkey : jkeys) { - String[] splitArray = jkey.split(delimiter); - String newJkey = splitArray[splitArray.length - 1]; + String[] splitArray = jkey.split(delimiter); + String newJkey = splitArray[splitArray.length - 1]; - Object jval = givenJsonObj.get(jkey); - givenJsonObj.remove(jkey); - givenJsonObj.put(newJkey, jval); + Object jval = givenJsonObj.get(jkey); + givenJsonObj.remove(jkey); + givenJsonObj.put(newJkey, jval); - if (jval instanceof JSONObject) { - renameKeysRecursively(delimiter, (JSONObject) jval); - } + if (jval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jval); + } - if (jval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jval); - } - } - } + if (jval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jval); + } + } + } - private String getJson(String url) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { + private String getJson(String url) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { - // Logging error and silently continuing - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); + // Logging error and silently continuing + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); // return null; // throw new Exception("Failed to get URL: " + e.toString(), e); - } - return ""; - } + } + return ""; + } } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java index bf2187569..07e15605f 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java @@ -13,7 +13,7 @@ import org.slf4j.LoggerFactory; /** * Main class for downloading and processing Usage statistics - * + * * @author D. Pierrakos, S. Zoupanos */ public class UsageStatsExporter { @@ -51,19 +51,13 @@ public class UsageStatsExporter { logger.info("Initialising DB properties"); ConnectDB.init(); -// runImpalaQuery(); - PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); logger.info("Re-creating database and tables"); - if (ExecuteWorkflow.recreateDbAndTables){ + if (ExecuteWorkflow.recreateDbAndTables) { piwikstatsdb.recreateDBAndTables(); - logger.info("DB-Tables-TmpTables are created "); - } -// else { -// piwikstatsdb.createTmpTables(); -// logger.info("TmpTables are created "); -// } + logger.info("DB-Tables-TmpTables are created "); + } logger.info("Initializing the download logs module"); PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); @@ -106,9 +100,8 @@ public class UsageStatsExporter { lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); } - - LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); + LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); @@ -116,7 +109,6 @@ public class UsageStatsExporter { logger.info("LaReferencia logs done"); } - IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables"); @@ -132,14 +124,11 @@ public class UsageStatsExporter { irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } - - if (ExecuteWorkflow.irusProcessStats) { + if (ExecuteWorkflow.irusProcessStats) { irusstats.processIrusStats(); logger.info("Irus done"); } - - SarcStats sarcStats = new SarcStats(); if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); @@ -148,51 +137,70 @@ public class UsageStatsExporter { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); } - - if (ExecuteWorkflow.sarcProcessStats) { + if (ExecuteWorkflow.sarcProcessStats) { sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - sarcStats.finalizeSarcStats(); + sarcStats.updateSarcLogs(); } logger.info("Sarc done"); - - -/* // finalize usagestats + + logger.info("Dropping tmp tables"); if (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); - logger.info("Finalized stats"); + logger.info("Dropped tmp tables"); } -*/ -/* - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } -*/ - - logger.info("End"); + logger.info("Raw Data Download End"); } - private void invalidateMetadata() throws SQLException { - Statement stmt = null; + public void createdDBWithTablesOnly() throws Exception { + logger.info("Initialising DB properties"); + ConnectDB.init(); - stmt = ConnectDB.getImpalaConnection().createStatement(); + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + piwikstatsdb.recreateDBAndTables(); - String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + piwikstatsdb.createPedocsOldUsageData(); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); + + logger.info("Creating sushilog"); + + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); + + logger.info("Updating piwiklog"); + String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog select * from openaire_prod_usage_raw.piwiklog"; stmt.executeUpdate(sql); - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + logger.info("Updating lareferencialog"); + sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialog select * from openaire_prod_usage_raw.lareferencialog"; stmt.executeUpdate(sql); - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + logger.info("Updating sushilog"); + sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog select * from openaire_prod_usage_raw.sushilog"; stmt.executeUpdate(sql); - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; - stmt.executeUpdate(sql); - - stmt.close(); + stmt.close(); ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } + } diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json index 988c23b48..1aa5ad6f8 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json @@ -125,12 +125,6 @@ "paramDescription": "Starting log period", "paramRequired": true }, - { - "paramName": "elp", - "paramLongName": "endingLogPeriod", - "paramDescription": "Ending log period", - "paramRequired": true - }, { "paramName": "npidd", "paramLongName": "numberOfPiwikIdsToDownload", @@ -216,12 +210,6 @@ "paramDescription": "Create the usage_stats table?", "paramRequired": true }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", - "paramRequired": true - }, { "paramName": "nodt", "paramLongName": "numberOfDownloadThreads", diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml index a6600516d..022a107ab 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml @@ -63,7 +63,6 @@ --downloadPiwikLogs${downloadPiwikLogs} --processPiwikLogs${processPiwikLogs} --startingLogPeriod${startingLogPeriod} - --endingLogPeriod${endingLogPeriod} --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} --laReferenciaEmptyDirs${laReferenciaEmptyDirs} @@ -78,7 +77,6 @@ --sarcProcessStats${sarcProcessStats} --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} --finalizeStats${finalizeStats} - --finalTablesVisibleToImpala${finalTablesVisibleToImpala} --numberOfDownloadThreads${numberOfDownloadThreads} diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index f400239f5..79fabb603 100644 --- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -23,7 +23,35 @@ 4.0.0 dhp-usage-stats-build - + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + UTF-8 UTF-8 diff --git a/dhp-workflows/dhp-usage-stats-build/runworkflow.sh b/dhp-workflows/dhp-usage-stats-build/runworkflow.sh new file mode 100755 index 000000000..191fb24c6 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagestatsbuild \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java index 8f0f8eae7..e53709f1a 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java @@ -3,12 +3,17 @@ * To change this template file, choose Tools | Templates * and open the template in the editor. */ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; import java.sql.Statement; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; import java.util.Properties; import org.apache.log4j.Logger; @@ -23,108 +28,120 @@ import com.mchange.v2.c3p0.ComboPooledDataSource; public abstract class ConnectDB { - public static Connection DB_HIVE_CONNECTION; - public static Connection DB_IMPALA_CONNECTION; + public static Connection DB_HIVE_CONNECTION; + public static Connection DB_IMPALA_CONNECTION; - private static String dbHiveUrl; - private static String dbImpalaUrl; - private static String usageRawDataDBSchema; - private static String usageStatsDBSchema; - private static String statsDBSchema; - private final static Logger log = Logger.getLogger(ConnectDB.class); + private static String dbHiveUrl; + private static String dbImpalaUrl; + private static String usageRawDataDBSchema; + private static String usageStatsDBSchema; + private static String usagestatsPermanentDBSchema; + private static String statsDBSchema; + private final static Logger log = Logger.getLogger(ConnectDB.class); - static void init() throws ClassNotFoundException { + static void init() throws ClassNotFoundException { - dbHiveUrl = ExecuteWorkflow.dbHiveUrl; - dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; - usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; - statsDBSchema = ExecuteWorkflow.statsDBSchema; - usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema; + dbHiveUrl = ExecuteWorkflow.dbHiveUrl; + dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; + usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; + statsDBSchema = ExecuteWorkflow.statsDBSchema; + usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema; + usagestatsPermanentDBSchema = ExecuteWorkflow.usagestatsPermanentDBSchema; - Class.forName("org.apache.hive.jdbc.HiveDriver"); - } + Class.forName("org.apache.hive.jdbc.HiveDriver"); + } - public static Connection getHiveConnection() throws SQLException { - if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { - return DB_HIVE_CONNECTION; - } else { - DB_HIVE_CONNECTION = connectHive(); + public static Connection getHiveConnection() throws SQLException { + if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { + return DB_HIVE_CONNECTION; + } else { + DB_HIVE_CONNECTION = connectHive(); - return DB_HIVE_CONNECTION; - } - } + return DB_HIVE_CONNECTION; + } + } - public static Connection getImpalaConnection() throws SQLException { - if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { - return DB_IMPALA_CONNECTION; - } else { - DB_IMPALA_CONNECTION = connectImpala(); + public static Connection getImpalaConnection() throws SQLException { + if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { + return DB_IMPALA_CONNECTION; + } else { + DB_IMPALA_CONNECTION = connectImpala(); - return DB_IMPALA_CONNECTION; - } - } + return DB_IMPALA_CONNECTION; + } + } - public static String getUsageRawDataDBSchema() { - return usageRawDataDBSchema; - } + public static String getUsageRawDataDBSchema() { + return ConnectDB.usageRawDataDBSchema; + } - public static String getUsageStatsDBSchema() { - return ConnectDB.usageStatsDBSchema; - } + public static String getUsageStatsDBSchema() { + String datePattern = "YYYYMMdd"; + DateFormat df = new SimpleDateFormat(datePattern); +// Get the today date using Calendar object. + Date today = Calendar.getInstance().getTime(); + String todayAsString = df.format(today); - public static String getStatsDBSchema() { - return ConnectDB.statsDBSchema; - } + return ConnectDB.usageStatsDBSchema + "_" + todayAsString; + } - private static Connection connectHive() throws SQLException { - /* + public static String getStatsDBSchema() { + return ConnectDB.statsDBSchema; + } + + public static String getUsagestatsPermanentDBSchema() { + return ConnectDB.usagestatsPermanentDBSchema; + } + + private static Connection connectHive() throws SQLException { + /* * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbHiveUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbHiveUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); - cpds.setAcquireRetryAttempts(30); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); + cpds.setAcquireRetryAttempts(30); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - return cpds.getConnection(); + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + return cpds.getConnection(); - } + } - private static Connection connectImpala() throws SQLException { - /* + private static Connection connectImpala() throws SQLException { + /* * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbImpalaUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbImpalaUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); - cpds.setAcquireRetryAttempts(30); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); + cpds.setAcquireRetryAttempts(30); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); - return cpds.getConnection(); + return cpds.getConnection(); - } + } } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java index 3f958abba..26e44b1f6 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java @@ -3,6 +3,7 @@ * To change this template file, choose Tools | Templates * and open the template in the editor. */ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.text.SimpleDateFormat; @@ -11,162 +12,142 @@ import java.util.Date; import org.apache.commons.io.IOUtils; import org.apache.log4j.BasicConfigurator; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + /** * @author D. Pierrakos, S. Zoupanos */ public class ExecuteWorkflow { - static String matomoAuthToken; - static String matomoBaseURL; - static String repoLogPath; - static String portalLogPath; - static String portalMatomoID; - static String irusUKBaseURL; - static String irusUKReportPath; - static String sarcsReportPathArray; - static String sarcsReportPathNonArray; - static String lareferenciaLogPath; - static String lareferenciaBaseURL; - static String lareferenciaAuthToken; - static String dbHiveUrl; - static String dbImpalaUrl; - static String usageRawDataDBSchema; - static String usageStatsDBSchema; - static String statsDBSchema; - static boolean recreateDbAndTables; +// static String matomoAuthToken; + static String matomoBaseURL; + static String repoLogPath; + static String portalLogPath; + static String portalMatomoID; +// static String irusUKBaseURL; + static String irusUKReportPath; + static String sarcsReportPathArray; + static String sarcsReportPathNonArray; + static String lareferenciaLogPath; +// static String lareferenciaBaseURL; +// static String lareferenciaAuthToken; + static String dbHiveUrl; + static String dbImpalaUrl; + static String usageRawDataDBSchema; + static String usageStatsDBSchema; + static String usagestatsPermanentDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; - static boolean piwikEmptyDirs; - static boolean downloadPiwikLogs; - static boolean processPiwikLogs; + static boolean processPiwikLogs; + static boolean processLaReferenciaLogs; - static Calendar startingLogPeriod; - static Calendar endingLogPeriod; - static int numberOfPiwikIdsToDownload; - static int numberOfSiteIdsToDownload; + static boolean irusProcessStats; - static boolean laReferenciaEmptyDirs; - static boolean downloadLaReferenciaLogs; - static boolean processLaReferenciaLogs; + static boolean sarcProcessStats; - static boolean irusCreateTablesEmptyDirs; - static boolean irusDownloadReports; - static boolean irusProcessStats; - static int irusNumberOfOpendoarsToDownload; + static boolean finalizeStats; + static boolean finalTablesVisibleToImpala; - static boolean sarcCreateTablesEmptyDirs; - static boolean sarcDownloadReports; - static boolean sarcProcessStats; - static int sarcNumberOfIssnToDownload; + static int numberOfDownloadThreads; - static boolean finalizeStats; - static boolean finalTablesVisibleToImpala; + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); - static int numberOfDownloadThreads; + public static void main(String args[]) throws Exception { - private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); + // Sending the logs to the console + BasicConfigurator.configure(); - public static void main(String args[]) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + UsageStatsExporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json"))); + parser.parseArgument(args); - // Sending the logs to the console - BasicConfigurator.configure(); + // Setting up the initial parameters +// matomoAuthToken = parser.get("matomoAuthToken"); +// matomoBaseURL = parser.get("matomoBaseURL"); + repoLogPath = parser.get("repoLogPath"); + portalLogPath = parser.get("portalLogPath"); + portalMatomoID = parser.get("portalMatomoID"); +// irusUKBaseURL = parser.get("irusUKBaseURL"); + irusUKReportPath = parser.get("irusUKReportPath"); + sarcsReportPathArray = parser.get("sarcsReportPathArray"); + sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); + lareferenciaLogPath = parser.get("lareferenciaLogPath"); +// lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); +// lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - UsageStatsExporter.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json"))); - parser.parseArgument(args); + dbHiveUrl = parser.get("dbHiveUrl"); + dbImpalaUrl = parser.get("dbImpalaUrl"); + usageRawDataDBSchema = parser.get("usageRawDataDBSchema"); + usageStatsDBSchema = parser.get("usageStatsDBSchema"); + usagestatsPermanentDBSchema = parser.get("usagestatsPermanentDBSchema"); + statsDBSchema = parser.get("statsDBSchema"); - // Setting up the initial parameters - matomoAuthToken = parser.get("matomoAuthToken"); - matomoBaseURL = parser.get("matomoBaseURL"); - repoLogPath = parser.get("repoLogPath"); - portalLogPath = parser.get("portalLogPath"); - portalMatomoID = parser.get("portalMatomoID"); - irusUKBaseURL = parser.get("irusUKBaseURL"); - irusUKReportPath = parser.get("irusUKReportPath"); - sarcsReportPathArray = parser.get("sarcsReportPathArray"); - sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); - lareferenciaLogPath = parser.get("lareferenciaLogPath"); - lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); - lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { + processPiwikLogs = true; + } else { + processPiwikLogs = false; + } - dbHiveUrl = parser.get("dbHiveUrl"); - dbImpalaUrl = parser.get("dbImpalaUrl"); - usageRawDataDBSchema = parser.get("usageRawDataDBSchema"); - usageStatsDBSchema = parser.get("usageStatsDBSchema"); - statsDBSchema = parser.get("statsDBSchema"); +// String startingLogPeriodStr = parser.get("startingLogPeriod"); +// Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); +// startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); +// +// String endingLogPeriodStr = parser.get("endingLogPeriod"); +// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); +// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); - if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { - processPiwikLogs = true; - } else { - processPiwikLogs = false; - } + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { + recreateDbAndTables = true; + } else { + recreateDbAndTables = false; + } - String startingLogPeriodStr = parser.get("startingLogPeriod"); - Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); - startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { + processLaReferenciaLogs = true; + } else { + processLaReferenciaLogs = false; + } - String endingLogPeriodStr = parser.get("endingLogPeriod"); - Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); - endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); + if (parser.get("irusProcessStats").toLowerCase().equals("true")) { + irusProcessStats = true; + } else { + irusProcessStats = false; + } - numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); - numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { + sarcProcessStats = true; + } else { + sarcProcessStats = false; + } - if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { - recreateDbAndTables = true; - } else { - recreateDbAndTables = false; - } + if (parser.get("finalizeStats").toLowerCase().equals("true")) { + finalizeStats = true; + } else { + finalizeStats = false; + } + if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) { + finalTablesVisibleToImpala = true; + } else { + numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); + } - if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { - processLaReferenciaLogs = true; - } else { - processLaReferenciaLogs = false; - } + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); + usagestatsExport.export(); + } - if (parser.get("irusProcessStats").toLowerCase().equals("true")) { - irusProcessStats = true; - } else { - irusProcessStats = false; - } + private static Calendar startingLogPeriodStr(Date date) { - irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); + Calendar calendar = Calendar.getInstance(); + calendar.setTime(date); + return calendar; - if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { - sarcProcessStats = true; - } else { - sarcProcessStats = false; - } - sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); - - if (parser.get("finalizeStats").toLowerCase().equals("true")) { - finalizeStats = true; - } else { - finalizeStats = false; - } - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) { - finalTablesVisibleToImpala = true; - } else { - numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); - } - - UsageStatsExporter usagestatsExport = new UsageStatsExporter(); - usagestatsExport.export(); - } - - private static Calendar startingLogPeriodStr(Date date) { - - Calendar calendar = Calendar.getInstance(); - calendar.setTime(date); - return calendar; - - } + } } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java index 4f34adc04..4439f848e 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.io.*; @@ -27,45 +28,42 @@ import org.slf4j.LoggerFactory; */ public class IrusStats { - private String irusUKURL; + private String irusUKURL; - private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); - public IrusStats() throws Exception { - } + public IrusStats() throws Exception { + } - - public void processIrusStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + logger.info("Creating irus_downloads_stats_tmp table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".irus_downloads_stats_tmp " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created irus_downloads_stats_tmp table"); - logger.info("Creating irus_downloads_stats_tmp table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".irus_downloads_stats_tmp " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created irus_downloads_stats_tmp table"); + logger.info("Inserting into irus_downloads_stats_tmp"); + String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; + stmt.executeUpdate(insertDStats); + logger.info("Inserted into irus_downloads_stats_tmp"); - logger.info("Inserting into irus_downloads_stats_tmp"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_oids ro " - + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; - stmt.executeUpdate(insertDStats); - logger.info("Inserted into irus_downloads_stats_tmp"); + stmt.close(); + // ConnectDB.getHiveConnection().close(); + } - stmt.close(); - //ConnectDB.getHiveConnection().close(); - } - - } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java index ea3ac5948..0d34ebef3 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java @@ -41,8 +41,6 @@ public class LaReferenciaStats { public LaReferenciaStats() throws Exception { } - - public void processLogs() throws Exception { try { logger.info("LaReferencia creating viewsStats"); @@ -62,7 +60,6 @@ public class LaReferenciaStats { } } - public void viewsStats() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); @@ -101,7 +98,7 @@ public class LaReferenciaStats { logger.info("Created la_views_stats_tmp table"); stmt.close(); - ConnectDB.getHiveConnection().close(); + // ConnectDB.getHiveConnection().close(); } private void downloadsStats() throws Exception { @@ -142,8 +139,7 @@ public class LaReferenciaStats { logger.info("Created la_downloads_stats_tmp table"); stmt.close(); - //ConnectDB.getHiveConnection().close(); + // ConnectDB.getHiveConnection().close(); } - } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java index a165c6eab..253dc03b5 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java @@ -1,22 +1,15 @@ package eu.dnetlib.oa.graph.usagestatsbuild.export; -import java.io.*; -import java.net.URLDecoder; import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; +import java.sql.Timestamp; import java.text.SimpleDateFormat; import java.util.*; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.sql.Timestamp; /** * @author D. Pierrakos, S. Zoupanos @@ -29,37 +22,51 @@ public class PiwikStatsDB { private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); - public PiwikStatsDB() throws Exception { } - public void recreateDBAndTables() throws Exception { this.createDatabase(); // The piwiklog table is not needed since it is built // on top of JSON files - ////////////this.createTmpTables(); + //////////// this.createTmpTables(); } private void createDatabase() throws Exception { + +// try { +// +// stmt = ConnectDB.getHiveConnection().createStatement(); +// +// logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); +// String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; +// stmt.executeUpdate(dropDatabase); +// } catch (Exception e) { +// logger.error("Failed to drop database: " + e); +// throw new Exception("Failed to drop database: " + e.toString(), e); +// } +// try { stmt = ConnectDB.getHiveConnection().createStatement(); + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + logger.info("Usagestats DB created: " + ConnectDB.getUsageStatsDBSchema()); - logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); } try { stmt = ConnectDB.getHiveConnection().createStatement(); - logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); + logger.info("Creating permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema()); + String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS " + + ConnectDB.getUsagestatsPermanentDBSchema(); + stmt.executeUpdate(createPermanentDatabase); + logger.info("Created permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema()); } catch (Exception e) { logger.error("Failed to create database: " + e); @@ -67,17 +74,16 @@ public class PiwikStatsDB { } } - public void processLogs() throws Exception { try { - logger.info("ViewsStats processing starts at: "+new Timestamp(System.currentTimeMillis())); + logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); viewsStats(); - logger.info("ViewsStats processing ends at: "+new Timestamp(System.currentTimeMillis())); + logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis())); - logger.info("DownloadsStats processing starts at: "+new Timestamp(System.currentTimeMillis())); + logger.info("DownloadsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); downloadsStats(); - logger.info("DownloadsStats processing ends at: "+new Timestamp(System.currentTimeMillis())); + logger.info("DownloadsStats processing ends at: " + new Timestamp(System.currentTimeMillis())); } catch (Exception e) { logger.error("Failed to process logs: " + e); @@ -85,68 +91,68 @@ public class PiwikStatsDB { } } - - public void viewsStats() throws Exception { Statement stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Dropping openaire_result_views_monthly_tmp view"); - String drop_result_views_monthly = "DROP VIEW IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".openaire_piwikresult_views_monthly_tmp"; + String drop_result_views_monthly = "DROP VIEW IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_piwikresult_views_monthly_tmp"; stmt.executeUpdate(drop_result_views_monthly); logger.info("Dropped openaire_result_views_monthly_tmp view"); logger.info("Creating openaire_result_views_monthly_tmp view"); String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() - + ".openaire_result_views_monthly_tmp " + - "AS SELECT entity_id AS id, " + - "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + - "AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageRawDataDBSchema() - + ".piwiklog where action='action' and (source_item_type='oaItem' or " + - "source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + - "source ORDER BY source, entity_id"; + + ".openaire_result_views_monthly_tmp " + + "AS SELECT entity_id, " + + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id," + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + + "AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + + ".piwiklog where action='action' and (source_item_type='oaItem' or " + + "source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + + "source ORDER BY source, entity_id"; stmt.executeUpdate(create_result_views_monthly); logger.info("Created openaire_result_views_monthly_tmp table"); logger.info("Dropping openaire_views_stats_tmp table"); - String drop_views_stats = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".openaire_views_stats_tmp"; + String drop_views_stats = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp"; stmt.executeUpdate(drop_views_stats); logger.info("Dropped openaire_views_stats_tmp table"); logger.info("Creating openaire_views_stats_tmp table"); String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".openaire_views_stats_tmp " + - "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(views) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.piwik_id AND p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month "; + + ".openaire_views_stats_tmp " + + "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(create_views_stats); logger.info("Created openaire_views_stats_tmp table"); logger.info("Creating openaire_pageviews_stats_tmp table"); String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".openaire_pageviews_stats_tmp AS SELECT " + - "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=" + ExecuteWorkflow.portalMatomoID + " AND p.source=d.piwik_id and p.id=ro.id \n" + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month "; + + ".openaire_pageviews_stats_tmp AS SELECT " + + "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=" + ExecuteWorkflow.portalMatomoID + + " AND p.source=d.piwik_id and p.id=ro.id AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(create_pageviews_stats); logger.info("Created pageviews_stats table"); stmt.close(); - //ConnectDB.getHiveConnection().close(); + // ConnectDB.getHiveConnection().close(); } private void downloadsStats() throws Exception { @@ -154,152 +160,315 @@ public class PiwikStatsDB { ConnectDB.getHiveConnection().setAutoCommit(false); logger.info("Dropping openaire_result_downloads_monthly_tmp view"); - String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".openaire_result_downloads_monthly_tmp"; + String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_downloads_monthly_tmp"; stmt.executeUpdate(drop_result_downloads_monthly); logger.info("Dropped openaire_result_downloads_monthly_tmp view"); logger.info("Creating openaire_result_downloads_monthly_tmp view"); - String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp " + - "AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " + - "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + - "FROM " + ConnectDB.getUsageRawDataDBSchema()+ ".piwiklog where action='download' " + - "AND (source_item_type='oaItem' OR source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + - "ORDER BY source, entity_id, month"; + String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_downloads_monthly_tmp " + + "AS SELECT entity_id, " + + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id," + + "COUNT(entity_id) as downloads, " + + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog where action='download' " + + "AND (source_item_type='oaItem' OR source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + + "ORDER BY source, entity_id, month"; stmt.executeUpdate(sql); logger.info("Created openaire_result_downloads_monthly_tmp view"); logger.info("Dropping openaire_downloads_stats_tmp table"); - String drop_views_stats = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".openaire_downloads_stats_tmp"; + String drop_views_stats = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp"; stmt.executeUpdate(drop_views_stats); logger.info("Dropped openaire_downloads_stats_tmp table"); logger.info("Creating openaire_downloads_stats_tmp table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " + - "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(downloads) AS count, max(openaire_referrer) AS openaire " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.piwik_id and p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + - "ORDER BY d.id, ro.id, month "; + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; stmt.executeUpdate(sql); logger.info("Created downloads_stats table"); - logger.info("Dropping openaire_result_downloads_monthly_tmp view"); - sql = "DROP VIEW IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp"; + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp"; logger.info("Dropped openaire_result_downloads_monthly_tmp view "); stmt.executeUpdate(sql); stmt.close(); - //ConnectDB.getHiveConnection().close(); + // ConnectDB.getHiveConnection().close(); + } + + public void uploadOldPedocs() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping Pedocs pedocs_views_stats_tmp table + logger.info("Dropping Pedocs pedocs_views_stats_tmp table"); + String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp"; + logger.info("Dropped pedocs_views_stats_tmp table "); + stmt.executeUpdate(sql); + + // Dropping Pedocs pedocs_downloads_stats table + logger.info("Dropping pedocs_downloads_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats"; + logger.info("Dropped pedocs_downloads_stats table "); + stmt.executeUpdate(sql); + + // Creating Pedocs pedocs_views_stats_tmp table + logger.info("Creating Pedocs pedocs_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id," + + "r.id as result_id,date,counter_abstract as count, 0 as openaire " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsoldviews p, " + ConnectDB.getStatsDBSchema() + + ".result_oids r where r.oid=p.identifier"; + stmt.executeUpdate(sql); + logger.info("Created pedocs_views_stats_tmp table "); + + // Creating Pedocs pedocs_downloads_stats_tmp table + logger.info("Creating Pedocs pedocs_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id," + + "r.id as result_id, date, counter as count, 0 as openaire " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsolddownloads p, " + ConnectDB.getStatsDBSchema() + + ".result_oids r where r.oid=p.identifier"; + stmt.executeUpdate(sql); + logger.info("Created pedocs_downloads_stats_tmp table "); + + } + + public void uploadTUDELFTStats() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view"); + String sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp"; + logger.info("Dropped tudelft_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp"; + logger.info("Dropped tudelft_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_views_stats_tmp table + logger.info("Dropping TUDELFT tudelft_views_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp"; + logger.info("Dropped tudelft_views_stats_tmp table "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_downloads_stats_tmp table + logger.info("Dropping TUDELFT tudelft_downloads_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp"; + logger.info("Dropped tudelft_downloads_stats_tmp table "); + stmt.executeUpdate(sql); + + // Creating TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Creating TUDELFT tudelft_result_views_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='action' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created tudelft_result_views_monthly_tmp view "); + + // Creating TUDELFT tudelft_views_stats_tmp table + logger.info("Creating TUDELFT tudelft_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".tudelft_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created TUDELFT tudelft_views_stats_tmp table"); + + // Creating TUDELFT tudelft_result_downloads_monthly_tmp view + logger.info("Creating TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='download' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created tudelft_result_downloads_monthly_tmp view "); + + // Creating TUDELFT tudelft_downloads_stats_tmp table + logger.info("Creating TUDELFT tudelft_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".tudelft_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created TUDELFT tudelft_downloads_stats_tmp table"); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp"; + logger.info("Dropped tudelft_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp"; + logger.info("Dropped tudelft_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + } public void finalizeStats() throws Exception { stmt = ConnectDB.getHiveConnection().createStatement(); ConnectDB.getHiveConnection().setAutoCommit(false); - //Dropping views_stats table - logger.info("Dropping views_stats table"); - String sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + // Dropping views_stats table + logger.info("Dropping views_stats table"); + String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; logger.info("Dropped views_stats table "); stmt.executeUpdate(sql); - //Dropping downloads_stats table - logger.info("Dropping downloads_stats table"); - sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + // Dropping downloads_stats table + logger.info("Dropping downloads_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; logger.info("Dropped downloads_stats table "); stmt.executeUpdate(sql); - //Dropping page_views_stats table - logger.info("Dropping pageviews_stats table"); - sql = "DROP TABLE IF EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + // Dropping page_views_stats table + logger.info("Dropping pageviews_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; logger.info("Dropped pageviews_stats table "); stmt.executeUpdate(sql); - //Creating views_stats table + // Dropping usage_stats table + logger.info("Dropping usage_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + logger.info("Dropped usage_stats table "); + stmt.executeUpdate(sql); + + // Creating views_stats table logger.info("Creating views_stats table"); - String createViewsStats = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".views_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET"; + String createViewsStats = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".views_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET"; stmt.executeUpdate(createViewsStats); logger.info("Created views_stats table"); - - //Inserting OpenAIRE views stats - logger.info("Inserting Openaire data to views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp"; - stmt.executeUpdate(sql); - logger.info("Openaire views updated to views_stats"); - //Inserting Lareferencia views stats - logger.info("Inserting LaReferencia data to views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; + // Inserting OpenAIRE views stats + logger.info("Inserting Openaire data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp"; stmt.executeUpdate(sql); - logger.info("LaReferencia views updated to views_stats"); - + logger.info("Openaire views updated to views_stats"); + + // Inserting Pedocs old views stats + logger.info("Inserting Pedocs old data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Pedocs views updated to views_stats"); + + // Inserting TUDELFT views stats + logger.info("Inserting TUDELFT data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("TUDELFT views updated to views_stats"); + + // Inserting Lareferencia views stats + logger.info("Inserting LaReferencia data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("LaReferencia views updated to views_stats"); logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".downloads_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET"; + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET"; stmt.executeUpdate(createDownloadsStats); logger.info("Created downloads_stats table"); - //Inserting OpenAIRE downloads stats + // Inserting OpenAIRE downloads stats logger.info("Inserting OpenAIRE data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp"; + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp"; stmt.executeUpdate(sql); + logger.info("Inserted OpenAIRE data to downloads_stats"); - //Inserting Lareferencia downloads stats + // Inserting Pedocs old downloads stats + logger.info("Inserting PeDocs old data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Inserted Pedocs data to downloads_stats"); + + // Inserting TUDELFT downloads stats + logger.info("Inserting TUDELFT old data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Inserted TUDELFT data to downloads_stats"); + + // Inserting Lareferencia downloads stats logger.info("Inserting LaReferencia data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; stmt.executeUpdate(sql); - logger.info("Lareferencia downloads updated to downloads_stats"); - - //Inserting IRUS downloads stats - logger.info("Inserting IRUS data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp"; - stmt.executeUpdate(sql); - logger.info("IRUS downloads updated to downloads_stats"); + logger.info("Lareferencia downloads updated to downloads_stats"); - //Inserting SARC-OJS downloads stats - logger.info("Inserting SARC data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp"; + // Inserting IRUS downloads stats + logger.info("Inserting IRUS data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp"; stmt.executeUpdate(sql); - logger.info("SARC-OJS downloads updated to downloads_stats"); - - - logger.info("Creating pageviews_stats table"); + logger.info("IRUS downloads updated to downloads_stats"); + + // Inserting SARC-OJS downloads stats + logger.info("Inserting SARC data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("SARC-OJS downloads updated to downloads_stats"); + + logger.info("Creating pageviews_stats table"); String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".pageviews_stats " + - "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET"; + + ".pageviews_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET"; stmt.executeUpdate(create_pageviews_stats); logger.info("Created pageviews_stats table"); - - //Inserting OpenAIRE views stats from Portal + + // Inserting OpenAIRE views stats from Portal logger.info("Inserting data to page_views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp"; + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp"; stmt.executeUpdate(sql); - + logger.info("Dropping full_dates table"); - String dropFullDates = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".full_dates"; + String dropFullDates = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".full_dates"; stmt.executeUpdate(dropFullDates); logger.info("Dropped full_dates table"); @@ -310,35 +479,80 @@ public class PiwikStatsDB { int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH); logger.info("Creating full_dates table"); - sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " + - "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " + - "FROM (SELECT DATE '2016-01-01' AS from_date) p " + - "LATERAL VIEW " + - "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x"; + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " + + "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " + + "FROM (SELECT DATE '2016-01-01' AS from_date) p " + + "LATERAL VIEW " + + "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x"; stmt.executeUpdate(sql); logger.info("Created full_dates table"); - - logger.info("Inserting data to usage_stats"); - sql = "CREATE TABLE IF NOT EXISTS "+ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " + - "SELECT coalesce(ds.source, vs.source) as source, " + - "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + - "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + - "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + - "coalesce(ds.openaire, 0) as openaire_downloads, " + - "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + - "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(sql); - logger.info("Inserted data to usage_stats"); + logger.info("Inserting data to usage_stats"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " + + "SELECT coalesce(ds.source, vs.source) as source, " + + "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + + "coalesce(ds.openaire, 0) as openaire_downloads, " + + "coalesce(vs.openaire, 0) as openaire_views " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + + ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; + stmt.executeUpdate(sql); + logger.info("Inserted data to usage_stats"); + logger.info("Building views at permanent DB starts at: " + new Timestamp(System.currentTimeMillis())); + + logger.info("Dropping view views_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view views_stats on permanent usagestats DB"); + + logger.info("Create view views_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + logger.info("Created view views_stats on permanent usagestats DB"); + + logger.info("Dropping view pageviews_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view pageviews_stats on permanent usagestats DB"); + + logger.info("Create view pageviews_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + logger.info("Created view pageviews_stats on permanent usagestats DB"); + + logger.info("Dropping view downloads_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view on downloads_stats on permanent usagestats DB"); + + logger.info("Create view on downloads_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + logger.info("Created view on downloads_stats on permanent usagestats DB"); + + logger.info("Dropping view usage_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view on usage_stats on permanent usagestats DB"); + + logger.info("Create view on usage_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + logger.info("Created view on usage_stats on permanent usagestats DB"); + + logger.info("Building views at permanent DB ends at: " + new Timestamp(System.currentTimeMillis())); stmt.close(); ConnectDB.getHiveConnection().close(); } - private Connection getConnection() throws SQLException { return ConnectDB.getHiveConnection(); } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java index 2d224075f..880233f00 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.io.*; @@ -33,74 +34,74 @@ import org.slf4j.LoggerFactory; */ public class SarcStats { - private Statement stmtHive = null; - private Statement stmtImpala = null; + private Statement stmtHive = null; + private Statement stmtImpala = null; - private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); - public SarcStats() throws Exception { + public SarcStats() throws Exception { // createTables(); - } + } - private void createTables() throws Exception { - try { + private void createTables() throws Exception { + try { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; - stmtHive.executeUpdate(sqlCreateTableSushiLog); + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); - // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; - // stmt.executeUpdate(sqlCopyPublicSushiLog); - String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " - + " ON INSERT TO sushilog " - + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," - + "sushilog.rid, sushilog.date " - + "FROM sushilog " - + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; - stmtHive.executeUpdate(sqlcreateRuleSushiLog); - String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; - stmtHive.executeUpdate(createSushiIndex); + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } - public void processSarc() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); + public void processSarc() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); - logger.info("Creating sarc_downloads_stats_tmp table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_downloads_stats_tmp " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created sarc_downloads_stats_tmp table"); + logger.info("Creating sarc_downloads_stats_tmp table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_downloads_stats_tmp " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created sarc_downloads_stats_tmp table"); - logger.info("Inserting into sarc_downloads_stats_tmp"); - String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " - + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " - + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_pids ro " - + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " - + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; - stmt.executeUpdate(insertSarcStats); - logger.info("Inserted into sarc_downloads_stats_tmp"); - - stmt.close(); - //ConnectDB.getHiveConnection().close(); - } + logger.info("Inserting into sarc_downloads_stats_tmp"); + String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " + + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_pids ro " + + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " + + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; + stmt.executeUpdate(insertSarcStats); + logger.info("Inserted into sarc_downloads_stats_tmp"); + + stmt.close(); + // ConnectDB.getHiveConnection().close(); + } } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java index 43abb1681..47986f52a 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java @@ -1,3 +1,4 @@ + package eu.dnetlib.oa.graph.usagestatsbuild.export; import java.io.IOException; @@ -17,90 +18,110 @@ import org.slf4j.LoggerFactory; */ public class UsageStatsExporter { - public UsageStatsExporter() { + public UsageStatsExporter() { - } + } - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - public void export() throws Exception { + public void export() throws Exception { - logger.info("Initialising DB properties"); - ConnectDB.init(); + logger.info("Initialising DB properties"); + ConnectDB.init(); // runImpalaQuery(); - PiwikStatsDB piwikstatsdb = new PiwikStatsDB(); + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(); - logger.info("Re-creating database and tables"); - if (ExecuteWorkflow.recreateDbAndTables) { - piwikstatsdb.recreateDBAndTables(); - logger.info("DB-Tables are created "); - } + logger.info("Re-creating database and tables"); + if (ExecuteWorkflow.recreateDbAndTables) { + piwikstatsdb.recreateDBAndTables(); + logger.info("DB-Tables are created "); + } // else { // piwikstatsdb.createTmpTables(); // logger.info("TmpTables are created "); // } - if (ExecuteWorkflow.processPiwikLogs) { - logger.info("Processing logs"); - piwikstatsdb.processLogs(); - } + if (ExecuteWorkflow.processPiwikLogs) { + logger.info("Processing Piwik logs"); + piwikstatsdb.processLogs(); + logger.info("Piwik logs Done"); + logger.info("Processing Pedocs Old Stats"); + piwikstatsdb.uploadOldPedocs(); + logger.info("Processing Pedocs Old Stats Done"); + logger.info("Processing TUDELFT Stats"); + piwikstatsdb.uploadTUDELFTStats(); + logger.info("Processing TUDELFT Stats Done"); - LaReferenciaStats lastats = new LaReferenciaStats(); + } - if (ExecuteWorkflow.processLaReferenciaLogs) { - logger.info("Processing LaReferencia logs"); - lastats.processLogs(); - logger.info("LaReferencia logs done"); - } - - IrusStats irusstats = new IrusStats(); - - if (ExecuteWorkflow.irusProcessStats) { - logger.info("Processing IRUS"); - irusstats.processIrusStats(); - logger.info("Irus done"); - } + LaReferenciaStats lastats = new LaReferenciaStats(); - SarcStats sarcStats = new SarcStats(); + if (ExecuteWorkflow.processLaReferenciaLogs) { + logger.info("Processing LaReferencia logs"); + lastats.processLogs(); + logger.info("LaReferencia logs done"); + } - if (ExecuteWorkflow.sarcProcessStats) { - sarcStats.processSarc(); - } - logger.info("Sarc done"); + IrusStats irusstats = new IrusStats(); - // finalize usagestats - if (ExecuteWorkflow.finalizeStats) { - piwikstatsdb.finalizeStats(); - logger.info("Finalized stats"); - } + if (ExecuteWorkflow.irusProcessStats) { + logger.info("Processing IRUS"); + irusstats.processIrusStats(); + logger.info("Irus done"); + } - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } + SarcStats sarcStats = new SarcStats(); - logger.info("End"); - } + if (ExecuteWorkflow.sarcProcessStats) { + sarcStats.processSarc(); + } + logger.info("Sarc done"); - private void invalidateMetadata() throws SQLException { - Statement stmt = null; + // finalize usagestats + if (ExecuteWorkflow.finalizeStats) { + piwikstatsdb.finalizeStats(); + logger.info("Finalized stats"); + } - stmt = ConnectDB.getImpalaConnection().createStatement(); + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } - String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; - stmt.executeUpdate(sql); + logger.info("End"); + } - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; - stmt.executeUpdate(sql); + private void invalidateMetadata() throws SQLException { + Statement stmt = null; - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; - stmt.executeUpdate(sql); + stmt = ConnectDB.getImpalaConnection().createStatement(); - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; - stmt.executeUpdate(sql); + String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); - stmt.close(); - ConnectDB.getHiveConnection().close(); - } + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } } diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json index 3f121288e..407370ada 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json @@ -1,237 +1,128 @@ [ - { - "paramName": "mat", - "paramLongName": "matomoAuthToken", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false - }, - { - "paramName": "mbu", - "paramLongName": "matomoBaseURL", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, - { - "paramName": "rlp", - "paramLongName": "repoLogPath", - "paramDescription": "nameNode of the source cluster", - "paramRequired": true - }, - { - "paramName": "plp", - "paramLongName": "portalLogPath", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "pmi", - "paramLongName": "portalMatomoID", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "iukbuw", - "paramLongName": "irusUKBaseURL", - "paramDescription": "working directory", - "paramRequired": true - }, - { - "paramName": "iukrp", - "paramLongName": "irusUKReportPath", - "paramDescription": "maximum number of map tasks used in the distcp process", - "paramRequired": true - }, - { - "paramName": "srpa", - "paramLongName": "sarcsReportPathArray", - "paramDescription": "memory for distcp action copying actionsets from remote cluster", - "paramRequired": true - }, - { - "paramName": "srpna", - "paramLongName": "sarcsReportPathNonArray", - "paramDescription": "timeout for distcp copying actions from remote cluster", - "paramRequired": true - }, - { - "paramName": "llp", - "paramLongName": "lareferenciaLogPath", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "lbu", - "paramLongName": "lareferenciaBaseURL", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "lat", - "paramLongName": "lareferenciaAuthToken", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "urdbs", - "paramLongName": "usageRawDataDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, { - "paramName": "usdbs", - "paramLongName": "usageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "piwikEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ppwl", - "paramLongName": "processPiwikLogs", - "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", - "paramRequired": true - }, - { - "paramName": "dpwl", - "paramLongName": "downloadPiwikLogs", - "paramDescription": "download piwik logs?", - "paramRequired": true - }, - { - "paramName": "slp", - "paramLongName": "startingLogPeriod", - "paramDescription": "Starting log period", - "paramRequired": true - }, - { - "paramName": "elp", - "paramLongName": "endingLogPeriod", - "paramDescription": "Ending log period", - "paramRequired": true - }, - { - "paramName": "npidd", - "paramLongName": "numberOfPiwikIdsToDownload", - "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload", - "paramRequired": true - }, - { - "paramName": "nsidd", - "paramLongName": "numberOfSiteIdsToDownload", - "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload", - "paramRequired": true - }, - { - "paramName": "lerd", - "paramLongName": "laReferenciaEmptyDirs", - "paramDescription": "Empty LaReferencia directories?", - "paramRequired": true - }, - { - "paramName": "plrl", - "paramLongName": "processLaReferenciaLogs", - "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", - "paramRequired": true - }, - { - "paramName": "dlrl", - "paramLongName": "downloadLaReferenciaLogs", - "paramDescription": "download La Referencia logs?", - "paramRequired": true - }, - { - "paramName": "icted", - "paramLongName": "irusCreateTablesEmptyDirs", - "paramDescription": "Irus section: Create tables and empty JSON directories?", - "paramRequired": true - }, - { - "paramName": "idr", - "paramLongName": "irusDownloadReports", - "paramDescription": "Irus section: Download reports?", - "paramRequired": true - }, - { - "paramName": "ipr", - "paramLongName": "irusProcessStats", - "paramDescription": "Irus section: Process stats?", - "paramRequired": true - }, - { - "paramName": "inod", - "paramLongName": "irusNumberOfOpendoarsToDownload", - "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", - "paramRequired": true - }, - { - "paramName": "icted", - "paramLongName": "sarcCreateTablesEmptyDirs", - "paramDescription": "Sarc section: Create tables and empty JSON directories?", - "paramRequired": true - }, - { - "paramName": "idr", - "paramLongName": "sarcDownloadReports", - "paramDescription": "Sarc section: Download reports?", - "paramRequired": true - }, - { - "paramName": "ipr", - "paramLongName": "sarcProcessStats", - "paramDescription": "Sarc section: Process stats?", - "paramRequired": true - }, - { - "paramName": "inod", - "paramLongName": "sarcNumberOfIssnToDownload", - "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", - "paramRequired": true - }, - - { - "paramName": "fs", - "paramLongName": "finalizeStats", - "paramDescription": "Create the usage_stats table?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", - "paramRequired": true - }, - { - "paramName": "nodt", - "paramLongName": "numberOfDownloadThreads", - "paramDescription": "Number of download threads", - "paramRequired": true - } + "paramName": "rlp", + "paramLongName": "repoLogPath", + "paramDescription": "nameNode of the source cluster", + "paramRequired": true + }, + { + "paramName": "plp", + "paramLongName": "portalLogPath", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "pmi", + "paramLongName": "portalMatomoID", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "iukrp", + "paramLongName": "irusUKReportPath", + "paramDescription": "maximum number of map tasks used in the distcp process", + "paramRequired": true + }, + { + "paramName": "srpa", + "paramLongName": "sarcsReportPathArray", + "paramDescription": "memory for distcp action copying actionsets from remote cluster", + "paramRequired": true + }, + { + "paramName": "srpna", + "paramLongName": "sarcsReportPathNonArray", + "paramDescription": "timeout for distcp copying actions from remote cluster", + "paramRequired": true + }, + { + "paramName": "llp", + "paramLongName": "lareferenciaLogPath", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "urdbs", + "paramLongName": "usageRawDataDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "usdbs", + "paramLongName": "usageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "uspdbs", + "paramLongName": "usagestatsPermanentDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "ppwl", + "paramLongName": "processPiwikLogs", + "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "plrl", + "paramLongName": "processLaReferenciaLogs", + "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "irusProcessStats", + "paramDescription": "Irus section: Process stats?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "sarcProcessStats", + "paramDescription": "Sarc section: Process stats?", + "paramRequired": true + }, + { + "paramName": "fs", + "paramLongName": "finalizeStats", + "paramDescription": "Create the usage_stats table?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", + "paramRequired": true + }, + { + "paramName": "nodt", + "paramLongName": "numberOfDownloadThreads", + "paramDescription": "Number of download threads", + "paramRequired": true + } ] diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml index 37700539b..71e8a50d6 100644 --- a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml @@ -42,42 +42,24 @@ eu.dnetlib.oa.graph.usagestatsbuild.export.ExecuteWorkflow - --matomoAuthToken${matomoAuthToken} - --matomoBaseURL${matomoBaseURL} --repoLogPath${repoLogPath} --portalLogPath${portalLogPath} --portalMatomoID${portalMatomoID} - --irusUKBaseURL${irusUKBaseURL} --irusUKReportPath${irusUKReportPath} --sarcsReportPathArray${sarcsReportPathArray} --sarcsReportPathNonArray${sarcsReportPathNonArray} --lareferenciaLogPath${lareferenciaLogPath} - --lareferenciaBaseURL${lareferenciaBaseURL} - --lareferenciaAuthToken${lareferenciaAuthToken} --dbHiveUrl${hiveJdbcUrl} --dbImpalaUrl${impalaJdbcUrl} --usageRawDataDBSchema${usageRawDataDBSchema} --usageStatsDBSchema${usageStatsDBSchema} + --usagestatsPermanentDBSchema${usagestatsPermanentDBSchema} --statsDBSchema${statsDBSchema} --recreateDbAndTables${recreateDbAndTables} - --piwikEmptyDirs${piwikEmptyDirs} - --downloadPiwikLogs${downloadPiwikLogs} --processPiwikLogs${processPiwikLogs} - --startingLogPeriod${startingLogPeriod} - --endingLogPeriod${endingLogPeriod} - --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} - --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} - --laReferenciaEmptyDirs${laReferenciaEmptyDirs} - --downloadLaReferenciaLogs${downloadLaReferenciaLogs} --processLaReferenciaLogs${processLaReferenciaLogs} - --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} - --irusDownloadReports${irusDownloadReports} --irusProcessStats${irusProcessStats} - --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload} - --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs} - --sarcDownloadReports${sarcDownloadReports} --sarcProcessStats${sarcProcessStats} - --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} --finalizeStats${finalizeStats} --finalTablesVisibleToImpala${finalTablesVisibleToImpala} --numberOfDownloadThreads${numberOfDownloadThreads} From 3e8d2a6b2d34691358a5e5f87dc072f195a0a5aa Mon Sep 17 00:00:00 2001 From: Dimitris Date: Fri, 15 Jan 2021 16:19:12 +0200 Subject: [PATCH 06/16] Clean workflows --- .../nb-configuration.xml | 18 - .../dhp-usage-datasets-stats-update/pom.xml | 112 -- .../runworkflow.sh | 1 - .../datasetsusagestats/export/ConnectDB.java | 131 -- .../export/DatasetsStatsDB.java | 168 --- .../DownloadReportsListFromDatacite.java | 102 -- .../export/ExecuteWorkflow.java | 69 - .../export/ReadReportsListFromDatacite.java | 408 ------ .../export/UsageStatsExporter.java | 111 -- .../datasets_usagestats_parameters.json | 56 - .../oozie_app/config-default.xml | 38 - .../datasetsusagestats/oozie_app/workflow.xml | 70 - dhp-workflows/dhp-usage-stats-update/pom.xml | 78 - .../oa/graph/usagestats/export/ConnectDB.java | 125 -- .../usagestats/export/ExecuteWorkflow.java | 197 --- .../oa/graph/usagestats/export/IrusStats.java | 419 ------ .../export/LaReferenciaDownloadLogs.java | 265 ---- .../usagestats/export/LaReferenciaStats.java | 436 ------ .../usagestats/export/PiwikDownloadLogs.java | 325 ----- .../graph/usagestats/export/PiwikStatsDB.java | 1262 ----------------- .../export/ReadCounterRobotsList.java | 54 - .../oa/graph/usagestats/export/SarcStats.java | 575 -------- .../usagestats/export/UsageStatsExporter.java | 179 --- .../export/usagestats_parameters.json | 231 --- .../usagestats/oozie_app/config-default.xml | 38 - .../graph/usagestats/oozie_app/workflow.xml | 90 -- 26 files changed, 5558 deletions(-) delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/pom.xml delete mode 100755 dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-usage-stats-update/pom.xml delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml deleted file mode 100644 index a65c4514a..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - JDK_1.8 - - diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml deleted file mode 100644 index 5593d4d87..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml +++ /dev/null @@ -1,112 +0,0 @@ - - - - - - - - - dhp-workflows - eu.dnetlib.dhp - 1.1.7-SNAPSHOT - - 4.0.0 - dhp-usage-datasets-stats-update - - - - pl.project13.maven - git-commit-id-plugin - 2.1.15 - - - - revision - - - - - ${project.basedir}/../.git - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.6.1 - - 1.8 - 1.8 - - - - - - UTF-8 - UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 - - - - - org.apache.spark - spark-core_2.11 - 2.2.0 - - - org.apache.spark - spark-sql_2.11 - 2.4.5 - - - com.googlecode.json-simple - json-simple - 1.1.1 - - - org.json - json - 20180130 - jar - - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - com.mchange - c3p0 - 0.9.5.2 - - - c3p0 - c3p0 - 0.9.1.2 - jar - - - dhp-usage-datasets-stats-update - diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh deleted file mode 100755 index 9b4325508..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh +++ /dev/null @@ -1 +0,0 @@ -mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java deleted file mode 100644 index 25b30e8ad..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.sql.Connection; -import java.sql.DriverManager; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.Properties; - -import org.apache.log4j.Logger; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -/** - * @author D. Pierrakos, S. Zoupanos - */ -import com.mchange.v2.c3p0.ComboPooledDataSource; - -public abstract class ConnectDB { - - public static Connection DB_HIVE_CONNECTION; - public static Connection DB_IMPALA_CONNECTION; - - private static String dbHiveUrl; - private static String dbImpalaUrl; - private static String datasetUsageStatsDBSchema; - private static String statsDBSchema; - private final static Logger logger = Logger.getLogger(ConnectDB.class); - private Statement stmt = null; - - static void init() throws ClassNotFoundException { - - dbHiveUrl = ExecuteWorkflow.dbHiveUrl; - dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; - datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema; - statsDBSchema = ExecuteWorkflow.statsDBSchema; - - Class.forName("org.apache.hive.jdbc.HiveDriver"); - } - - public static Connection getHiveConnection() throws SQLException { - if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { - return DB_HIVE_CONNECTION; - } else { - DB_HIVE_CONNECTION = connectHive(); - - return DB_HIVE_CONNECTION; - } - } - - public static Connection getImpalaConnection() throws SQLException { - if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { - return DB_IMPALA_CONNECTION; - } else { - DB_IMPALA_CONNECTION = connectImpala(); - - return DB_IMPALA_CONNECTION; - } - } - - public static String getDataSetUsageStatsDBSchema() { - return ConnectDB.datasetUsageStatsDBSchema; - } - - public static String getStatsDBSchema() { - return ConnectDB.statsDBSchema; - } - - private static Connection connectHive() throws SQLException { - /* - * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = - * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbHiveUrl); - cpds.setUser("dimitris.pierrakos"); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); - - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); - - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - - logger.info("Opened database successfully"); - - return cpds.getConnection(); - - } - - private static Connection connectImpala() throws SQLException { - /* - * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = - * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbImpalaUrl); - cpds.setUser("dimitris.pierrakos"); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); - - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); - - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - - logger.info("Opened database successfully"); - return cpds.getConnection(); - - } -} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java deleted file mode 100644 index 88db1f819..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java +++ /dev/null @@ -1,168 +0,0 @@ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.sql.Connection; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.*; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class DatasetsStatsDB { - - private String logPath; - private String logRepoPath; - private String logPortalPath; - - private Statement stmt = null; - - private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class); - - private String CounterRobotsURL; - private ArrayList robotsList; - - public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception { - this.logRepoPath = logRepoPath; - this.logPortalPath = logPortalPath; - - } - - public void recreateDBAndTables() throws Exception { - this.createDatabase(); - this.createTables(); - } - -// public void reCreateLogDirs() throws IllegalArgumentException, IOException { -// FileSystem dfs = FileSystem.get(new Configuration()); -// -// logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); -// dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); -// -// logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); -// dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); -// -// logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); -// dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); -// -// logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); -// dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); -// } - public ArrayList getRobotsList() { - return robotsList; - } - - public void setRobotsList(ArrayList robotsList) { - this.robotsList = robotsList; - } - - public String getCounterRobotsURL() { - return CounterRobotsURL; - } - - public void setCounterRobotsURL(String CounterRobotsURL) { - this.CounterRobotsURL = CounterRobotsURL; - } - - private void createDatabase() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); - } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); - } - - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); - - } catch (Exception e) { - logger.error("Failed to create database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } - } - - private void createTables() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - // Create Reports table - This table should exist - logger.info("Creating Reports Table"); - String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports(reportid STRING, \n" - + " name STRING, \n" - + " source STRING,\n" - + " release STRING,\n" - + " createdby STRING,\n" - + " report_start_date STRING,\n" - + " report_end_date STRING)\n" - + " CLUSTERED BY (reportid)\n" - + " into 100 buckets stored as orc tblproperties('transactional'='true')"; - - stmt.executeUpdate(sqlCreateTableDataciteReports); - logger.info("Reports Table Created"); - - // Create Datasets Table - logger.info("Creating DataSets Table"); - String sqlCreateTableDataSets = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasets(ds_type STRING,\n" - + " ds_title STRING,\n" - + " yop STRING,\n" - + " uri STRING,\n" - + " platform STRING,\n" - + " data_type STRING,\n" - + " publisher STRING,\n" - + " publisher_id_type STRING,\n" - + " publisher_id_value STRING,\n" - + " ds_dates_type STRING,\n" - + " ds_pub_date STRING,\n" - + " ds_contributors STRING,\n" - // + " ds_contributor_value array ,\n" - + " reportid STRING)\n" - + " CLUSTERED BY (ds_type)\n" - + " into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableDataSets); - logger.info("DataSets Table Created"); - - // Create Datasets Performance Table - logger.info("Creating DataSetsPerformance Table"); - String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance(ds_type STRING,\n" - + " period_end STRING,\n" - + " period_from STRING,\n" - + " access_method STRING,\n" - + " metric_type STRING,\n" - + " count INT,\n" - + " country_counts STRING,\n" - + " reportid STRING)\n" - + " CLUSTERED BY (ds_type)\n" - + " into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableDataSetsPerformance); - logger.info("DataSetsPerformance Table Created"); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } - - private Connection getConnection() throws SQLException { - return ConnectDB.getHiveConnection(); - } -} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java deleted file mode 100644 index a73b299ec..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Iterator; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.parser.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; - -/** - * @author dpie - */ -public class DownloadReportsListFromDatacite { - - private String dataciteBaseURL; - private String dataciteReportPath; - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) - throws MalformedURLException, Exception { - - this.dataciteBaseURL = dataciteBaseURL; - this.dataciteReportPath = dataciteReportPath; - } - - public void downloadReportsList() throws ParseException { - StringBuilder responseStrBuilder = new StringBuilder(); - - Gson gson = new Gson(); - - try { - BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream()); - logger.info("Downloading from " + dataciteBaseURL); - - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String inputStr; - - while ((inputStr = streamReader.readLine()) != null) { - responseStrBuilder.append(inputStr); - } - } catch (IOException e) { - logger.info(e.getMessage()); - } - JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class); - JsonArray dataArray = jsonObject.getAsJsonArray("reports"); - ArrayList reportsList = new ArrayList(); - for (JsonElement element : dataArray) { - reportsList.add(element.getAsJsonObject().get("id").getAsString()); - } - - Iterator it = reportsList.iterator(); - while (it.hasNext()) { - String reportId = it.next().toString(); - String url = dataciteBaseURL + reportId; - - try { - BufferedInputStream in = new BufferedInputStream(new URL(url).openStream()); - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String inputStr; - StringBuilder responseStrBuilder2 = new StringBuilder(); - while ((inputStr = streamReader.readLine()) != null) { - responseStrBuilder2.append(inputStr); - } - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs - .create( - new Path(dataciteReportPath + "/" + reportId + ".json"), - true); - byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - fin.close(); - - fin.close(); - } catch (IOException e) { - System.out.println(e); - } - } - } -} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java deleted file mode 100644 index b28578e4b..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import org.apache.commons.io.IOUtils; -import org.apache.log4j.BasicConfigurator; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class ExecuteWorkflow { - - static String dataciteBaseURL; - static String dataciteReportPath; - static String dbHiveUrl; - static String dbImpalaUrl; - static String datasetUsageStatsDBSchema; - static String statsDBSchema; - static boolean recreateDbAndTables; - static boolean datasetsEmptyDirs; - static boolean finalTablesVisibleToImpala; - - public static void main(String args[]) throws Exception { - - // Sending the logs to the console - BasicConfigurator.configure(); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - UsageStatsExporter.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json"))); - parser.parseArgument(args); - - // Setting up the initial parameters - dataciteBaseURL = parser.get("dataciteBaseURL"); - dataciteReportPath = parser.get("dataciteReportPath"); - dbHiveUrl = parser.get("dbHiveUrl"); - dbImpalaUrl = parser.get("dbImpalaUrl"); - datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema"); - statsDBSchema = parser.get("statsDBSchema"); - - if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) - recreateDbAndTables = true; - else - recreateDbAndTables = false; - - if (parser.get("datasetsEmptyDirs").toLowerCase().equals("true")) - datasetsEmptyDirs = true; - else - datasetsEmptyDirs = false; - -// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) -// finalTablesVisibleToImpala = true; -// else -// finalTablesVisibleToImpala = false; -// - UsageStatsExporter usagestatsExport = new UsageStatsExporter(); - usagestatsExport.export(); - } - -} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java deleted file mode 100644 index ccb3eebd3..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java +++ /dev/null @@ -1,408 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.io.*; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.sql.Array; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.ArrayList; -import java.util.Base64; -import java.util.Iterator; -import java.util.Set; -import java.util.zip.GZIPInputStream; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; - -/** - * @author dpie - */ -public class ReadReportsListFromDatacite { - - private String dataciteReportPath; - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception { - - this.dataciteReportPath = dataciteReportPath; - } - - public void readReports() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - File folder = new File(dataciteReportPath); - ArrayList jsonFiles = listHdfsDir(dataciteReportPath); - for (String jsonFile : jsonFiles) { - logger.info("Reading report file " + jsonFile); - this.createTmpReportsTable(jsonFile); - - String sqlSelectReportID = "SELECT get_json_object(json, '$.report.id') FROM " - + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlSelectReportID); - ResultSet rstmpReportID = stmt.getResultSet(); - - String reportID = null; - while (rstmpReportID.next()) { - reportID = rstmpReportID.getString(1); - } - - logger.info("Checking report with id " + reportID); - String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports where reportid=?"; - PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); - stGetReportID.setString(1, reportID); - - ResultSet rsCheckIfReportExist = stGetReportID.executeQuery(); - - if (rsCheckIfReportExist.next()) { - logger.info("Report found with ID " + reportID); - dropTmpReportsTable(); - } else { - String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datacitereports " - + "SELECT\n" - + " get_json_object(json, '$.report.id') AS reportid,\n" - + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" - + " get_json_object(json, '$.report.report-header.report-id') AS source,\n" - + " get_json_object(json, '$.report.report-header.release') AS release,\n" - + " get_json_object(json, '$.report.report-header.created-by\') AS createdby,\n" - + " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n" - + " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n" - + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlInsertReport); - - logger.info("Report added"); - - logger.info("Adding datasets"); - String sqlSelecteDatasetsArray = "SELECT get_json_object(json, '$.report.report-datasets') FROM " - + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlSelecteDatasetsArray); - ResultSet rstmpReportDatasets = stmt.getResultSet(); - - if (rstmpReportDatasets.next() && rstmpReportDatasets.getString(1).indexOf(',') > 0) { - String[] listDatasets = rstmpReportDatasets.getString(1).split(","); - logger.info("Datasets found " + listDatasets.length); - - for (int i = 0; i < listDatasets.length; i++) { - - String sqlInsertDataSets = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datasets " - + "SELECT\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-id[0].value') AS ds_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-title') AS ds_title,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].yop') AS yop,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].uri') AS uri,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].platform') AS platform,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].data-type') AS data_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].publisher') AS publisher,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].publisher-id.type[0]') AS publisher_id_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].publisher-id.value[0]') AS publisher_id_value,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-dates.type[0]') AS ds_dates_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-dates.value[0]') AS ds_dates_value,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-contributors') AS ds_contributors,\n" - + " get_json_object(json, '$.report.id') AS reportid \n" - + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlInsertDataSets); - - logger.info("Dataset added " + i); - - logger.info("Adding Dataset Performance"); - String sqlSelecteDatasetsPerformance = "SELECT get_json_object(json, '$.report.report-datasets[" - + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlSelecteDatasetsPerformance); - ResultSet rstmpReportDatasetsPerformance = stmt.getResultSet(); - if (rstmpReportDatasetsPerformance.next() - && rstmpReportDatasetsPerformance.getString(1).indexOf(',') > 0) { - String[] listDatasetsPerformance = rstmpReportDatasetsPerformance.getString(1).split(","); - logger.info("Datasets Performance found " + listDatasetsPerformance.length); - for (int j = 0; j < listDatasetsPerformance.length; j++) { - String sqlSelecteDatasetsPerformanceInstance = "SELECT get_json_object(json, '$.report.report-datasets[" - + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".tmpjson"; - stmt.execute(sqlSelecteDatasetsPerformanceInstance); - ResultSet rstmpReportDatasetsPerformanceInstance = stmt.getResultSet(); - if (rstmpReportDatasetsPerformanceInstance.next() - && rstmpReportDatasetsPerformanceInstance.getString(1).indexOf(',') > 0) { - String[] listDatasetsPerformanceInstance = rstmpReportDatasetsPerformanceInstance - .getString(1) - .split(","); - logger.info("Datasets Performance found " + listDatasetsPerformanceInstance.length); - for (int k = 0; k < listDatasetsPerformanceInstance.length; k++) { - String sqlInsertDataSetsPerformance = "INSERT INTO " - + ConnectDB.getDataSetUsageStatsDBSchema() + " .datasetsperformance " - + "SELECT\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-id[0].value') AS ds_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].period.end-date') AS period_end,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].period.begin-date') AS period_from,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].instance[" + k - + "].access-method') AS access_method,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].instance[" + k - + "].metric-type') AS metric_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].instance[" + k + "].count') AS count,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].instance[" + k - + "].country-counts') AS country_counts,\n" - + " get_json_object(json, '$.report.id') AS reportid \n" - + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlInsertDataSetsPerformance); - } - } - } - } - logger.info("DatasetPerformance added for dataset" + i); - } - } - logger.info("Adding gzip performance"); - String sqlSelecteReportSubsets = "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM " - + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlSelecteReportSubsets); - ResultSet rstmpReportSubsets = stmt.getResultSet(); - if (rstmpReportSubsets.next()) { - String unCompressedReport = uncompressString(rstmpReportSubsets.getString(1)); - this.readCompressedReport(unCompressedReport, reportID); - } - } - } - this.dropTmpReportsTable(); - } - - public void readCompressedReport(String report, String reportId) throws Exception { - Gson gson = new Gson(); - JsonObject jsonObject = gson.fromJson(report, JsonObject.class); - - JsonArray jsonReportDatasets; - if (jsonObject.getAsJsonArray("report_datasets") != null) { - jsonReportDatasets = jsonObject.getAsJsonArray("report_datasets"); - } else { - jsonReportDatasets = jsonObject.getAsJsonArray("report-datasets"); - } - - for (JsonElement datasetElement : jsonReportDatasets) { - // JsonElement dataset_title = datasetElement.getAsJsonObject().get("dataset-title"); - String dataset_title = datasetElement.getAsJsonObject().get("dataset-title").getAsString(); - String yop = datasetElement.getAsJsonObject().get("yop").getAsString(); - String uri = datasetElement.getAsJsonObject().get("uri").getAsString(); - String platform = datasetElement.getAsJsonObject().get("platform").getAsString(); - String data_type = datasetElement.getAsJsonObject().get("data-type").getAsString(); - String publisher = datasetElement.getAsJsonObject().get("publisher").getAsString(); - - JsonArray publisher_id = datasetElement.getAsJsonObject().getAsJsonArray("publisher-id"); - String publisher_id_type = ""; - String publisher_id_value = ""; - for (JsonElement publisher_id_Element : publisher_id) { - publisher_id_type = publisher_id_Element.getAsJsonObject().get("type").getAsString(); - publisher_id_value = publisher_id_Element.getAsJsonObject().get("value").getAsString(); - } - JsonArray dataset_days = datasetElement.getAsJsonObject().getAsJsonArray("dataset-dates"); - String ds_dates_type = ""; - String ds_dates_value = ""; - for (JsonElement datasetDaysElement : dataset_days) { - ds_dates_type = datasetDaysElement.getAsJsonObject().get("type").getAsString(); - ds_dates_value = datasetDaysElement.getAsJsonObject().get("value").getAsString(); - } - - JsonArray datasetContributors = null; - String ds_contributor_type = ""; - String[] ds_contributor_values = null; - Array ds_contributor_valuesArr = null; - - if (datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors") != null) { - datasetContributors = datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors"); - - JsonArray datasetid = datasetElement.getAsJsonObject().getAsJsonArray("dataset-id"); - String doi = ""; - for (JsonElement datasetIDElement : datasetid) -//System.out.println(datasetIDElement.getAsJsonObject().get("value").getAsString()); - { - doi = datasetIDElement.getAsJsonObject().get("value").getAsString(); - } - - String sqlInsertDataset = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datasets(ds_type," - + "ds_title,yop,uri,platform,data_type,publisher,publisher_id_type,publisher_id_value," - + "ds_dates_type, ds_dates_value, ds_contributors,reportid) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?) "; - - PreparedStatement pstmtDataset = ConnectDB.DB_HIVE_CONNECTION.prepareStatement(sqlInsertDataset); - - pstmtDataset.setString(1, doi); - pstmtDataset.setString(2, dataset_title); - pstmtDataset.setString(3, yop); - pstmtDataset.setString(4, uri); - pstmtDataset.setString(5, platform); - pstmtDataset.setString(6, data_type); - pstmtDataset.setString(7, publisher); - pstmtDataset.setString(8, publisher_id_type); - pstmtDataset.setString(9, publisher_id_value); - pstmtDataset.setString(10, ds_dates_type); - pstmtDataset.setString(11, ds_dates_value); - pstmtDataset.setString(13, datasetContributors.getAsString()); - pstmtDataset.setString(14, reportId); - - pstmtDataset.execute(); - logger.info("Dataset from compressed report addded " + doi); - /* - * JsonArray performance = datasetElement.getAsJsonObject().getAsJsonArray("performance"); for - * (JsonElement performanceElement : performance) { JsonObject period = - * performanceElement.getAsJsonObject().getAsJsonObject("period"); String end_date = - * period.getAsJsonObject().get("end-date").getAsString(); String begin_date = - * period.getAsJsonObject().get("begin-date").getAsString(); JsonArray instance = - * performanceElement.getAsJsonObject().getAsJsonArray("instance"); for (JsonElement instanceElement : - * instance) { int count = instanceElement.getAsJsonObject().get("count").getAsInt(); JsonObject - * country_counts = instanceElement.getAsJsonObject().getAsJsonObject("country-counts"); Set - * keys = country_counts.keySet(); String[] country = new String[country_counts.size()]; String[] - * country_counts_val = new String[country_counts.size()]; Iterator it2 = keys.iterator(); int j = 0; - * while (it2.hasNext()) { country[j] = it2.next().toString(); country_counts_val[j] = - * country_counts.get(country[j]).getAsString(); } Array countryArr = conn.createArrayOf("text", - * country); Array countrycountsArr = conn.createArrayOf("text", country_counts_val); String metrictype - * = instanceElement.getAsJsonObject().get("metric-type").getAsString(); String accessMethod = - * instanceElement.getAsJsonObject().get("access-method").getAsString(); String - * sqlInsertDatasetPerformance = - * "INSERT INTO datasetperformance(ds_type,period_end,period_from,access_method,metric_type,count,country,country_count, reportid) VALUES(?,?,?,?,?,?,?,?,?)" - * ; PreparedStatement pstmtDatasetPerformance = conn.prepareStatement(sqlInsertDatasetPerformance); - * //System.out.println(begin_date + " " + end_date + " " + doi + " " + metrictype + " " + count); - * pstmtDatasetPerformance.setString(1, doi); pstmtDatasetPerformance.setString(2, end_date); - * pstmtDatasetPerformance.setString(3, begin_date); pstmtDatasetPerformance.setString(4, accessMethod); - * pstmtDatasetPerformance.setString(5, metrictype); pstmtDatasetPerformance.setInt(6, count); - * pstmtDatasetPerformance.setArray(7, countryArr); pstmtDatasetPerformance.setArray(8, - * countrycountsArr); pstmtDatasetPerformance.setString(9, reportId); pstmtDatasetPerformance.execute(); - * } } - */ - } - } - - } - - private ArrayList listHdfsDir(String dir) throws Exception { - - FileSystem hdfs = FileSystem.get(new Configuration()); - RemoteIterator Files; - ArrayList fileNames = new ArrayList<>(); - - try { - Path exportPath = new Path(hdfs.getUri() + dir); - Files = hdfs.listFiles(exportPath, false); - while (Files.hasNext()) { - String fileName = Files.next().getPath().toString(); - fileNames.add(fileName); - } - - hdfs.close(); - } catch (Exception e) { - logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir)); - throw new Exception("HDFS file path with exported data does not exist : " + dir, e); - } - - return fileNames; - } - - private String readHDFSFile(String filename) throws Exception { - String result; - try { - - FileSystem fs = FileSystem.get(new Configuration()); - // log.info("reading file : " + filename); - - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); - - StringBuilder sb = new StringBuilder(); - String line = br.readLine(); - - while (line != null) { - sb.append(line); - // sb.append(line); - line = br.readLine(); - } - // result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); - result = sb.toString().trim(); - // fs.close(); - } catch (Exception e) { - throw new Exception(e); - } - - return result; - } - - public static String uncompressString(String zippedBase64Str) - throws IOException { - String result = null; - - // In my solr project, I use org.apache.solr.common.util.Base64. - // byte[] bytes = - // org.apache.solr.common.util.Base64.base64ToByteArray(zippedBase64Str); - byte[] bytes = Base64.getDecoder().decode(zippedBase64Str); - GZIPInputStream zi = null; - try { - zi = new GZIPInputStream(new ByteArrayInputStream(bytes)); - result = IOUtils.toString(zi); - } finally { - IOUtils.closeQuietly(zi); - } - return result; - } - - private void createTmpReportsTable(String jsonFile) throws SQLException { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - dropTmpReportsTable(); - String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".tmpjson (json STRING)"; - stmt.executeUpdate(createTmpTable); - logger.info("Tmp Table Created"); - - String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE " - + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(insertJsonReport); - logger.info("JSON Report File inserted to tmpjson Table"); - } - - private void dropTmpReportsTable() throws SQLException { - logger.info("Dropping tmpjson Table"); - String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - stmt.executeUpdate(dropTmpTable); - logger.info("Dropped tmpjson Table"); - - } - -} - -/* - * PreparedStatement prepStatem = conn. - * prepareStatement("insert into usageStats (source, entityID,sourceItemType,entityType, counter,action,timestamp_month,referrer) values (?,?,?,?,?,?,?,?)" - * ); - */ diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java deleted file mode 100644 index 7b07fbc25..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java +++ /dev/null @@ -1,111 +0,0 @@ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.io.IOException; -import java.sql.SQLException; -import java.sql.Statement; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Main class for downloading and processing Usage statistics - * - * @author D. Pierrakos, S. Zoupanos - */ -public class UsageStatsExporter { - - private Statement stmt = null; - - public UsageStatsExporter() { - - } - - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - private void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath); - dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true); - - logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath)); - - } - - public void export() throws Exception { - - logger.info("Initialising DB properties"); - ConnectDB.init(); - ConnectDB.getHiveConnection(); - - if (ExecuteWorkflow.recreateDbAndTables) { - DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", ""); - datasetsDB.recreateDBAndTables(); - } - logger.info("Initializing the download logs module"); - DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, - ExecuteWorkflow.dataciteReportPath); - - if (ExecuteWorkflow.datasetsEmptyDirs) { - logger.info("Downloading Reports List From Datacite"); - drfd.downloadReportsList(); - logger.info("Reports List has been downloaded"); - } - - ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite( - ExecuteWorkflow.dataciteReportPath); - logger.info("Store Reports To DB"); - readReportsListFromDatacite.readReports(); - logger.info("Reports Stored To DB"); - } - -// runImpalaQuery(); - /* - * PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); - * logger.info("Re-creating database and tables"); logger.info("Initializing the download logs module"); - * PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); - * if (ExecuteWorkflow.piwikEmptyDirs) { logger.info("Recreating Piwik log directories"); - * piwikstatsdb.reCreateLogDirs(); } // Downloading piwik logs (also managing directory creation) if - * (ExecuteWorkflow.downloadPiwikLogs) { logger.info("Downloading piwik logs"); piwd .GetOpenAIRELogs( - * ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); } - * logger.info("Downloaded piwik logs"); // Create DB tables, insert/update statistics String cRobotsUrl = - * "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; - * piwikstatsdb.setCounterRobotsURL(cRobotsUrl); if (ExecuteWorkflow.processPiwikLogs) { - * logger.info("Processing logs"); piwikstatsdb.processLogs(); } logger.info("Creating LaReferencia tables"); - * LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, - * ExecuteWorkflow.lareferenciaAuthToken); if (ExecuteWorkflow.laReferenciaEmptyDirs) { - * logger.info("Recreating LaReferencia log directories"); lrf.reCreateLogDirs(); } if - * (ExecuteWorkflow.downloadLaReferenciaLogs) { logger.info("Downloading LaReferencia logs"); - * lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); } - * LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if - * (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); lastats.processLogs(); - * logger.info("LaReferencia logs done"); } IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if - * (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables"); - * irusstats.createTables(); logger.info("Created Irus Stats tables"); logger.info("Re-create log dirs"); - * irusstats.reCreateLogDirs(); logger.info("Re-created log dirs"); } if (ExecuteWorkflow.irusDownloadReports) { - * irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } if (ExecuteWorkflow.irusProcessStats) { - * irusstats.processIrusStats(); logger.info("Irus done"); } SarcStats sarcStats = new SarcStats(); if - * (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); } if - * (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, - * ExecuteWorkflow.sarcsReportPathNonArray); } if (ExecuteWorkflow.sarcProcessStats) { - * sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - * sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); // finalize usagestats if - * (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); logger.info("Finalized stats"); } // Make the - * tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) { - * logger.info("Making tables visible to Impala"); invalidateMetadata(); } logger.info("End"); - */ -} -/* - * private void invalidateMetadata() throws SQLException { Statement stmt = null; stmt = - * ConnectDB.getImpalaConnection().createStatement(); String sql = "INVALIDATE METADATA " + - * ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " - * + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + - * ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + - * ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); stmt.close(); - * ConnectDB.getHiveConnection().close(); } - */ diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json deleted file mode 100644 index f8d51a882..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json +++ /dev/null @@ -1,56 +0,0 @@ -[ - { - "paramName": "dbu", - "paramLongName": "dataciteBaseURL", - "paramDescription": "URL of Datacite Reports Endpoint", - "paramRequired": true - }, - { - "paramName": "drp", - "paramLongName": "dataciteReportPath", - "paramDescription": "Path for Datacite Reports", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dusdbs", - "paramLongName": "datasetUsageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "datasetsEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the dataset_usage_stats, visible to Impala", - "paramRequired": true - } -] diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml deleted file mode 100644 index b5c807378..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - jobTracker - ${jobTracker} - - - nameNode - ${nameNode} - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 - - - impalaJdbcUrl - jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; - - - oozie.wf.workflow.notification.url - {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status - - - oozie.use.system.libpath - true - - diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml deleted file mode 100644 index 3a81e497d..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml +++ /dev/null @@ -1,70 +0,0 @@ - - - - hiveMetastoreUris - Hive server metastore URIs - - - hiveJdbcUrl - Hive server jdbc url - - - impalaJdbcUrl - Impala server jdbc url - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hiveMetastoreUris} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - eu.dnetlib.oa.graph.datasetsusagestats.export.ExecuteWorkflow - --dataciteBaseURL - ${dataciteBaseURL} - --dataciteReportPath - ${dataciteReportPath} - --dbHiveUrl - ${hiveJdbcUrl} - --dbImpalaUrl - ${impalaJdbcUrl} - --datasetUsageStatsDBSchema - ${datasetUsageStatsDBSchema} - --statsDBSchema - ${statsDBSchema} - --recreateDbAndTables - ${recreateDbAndTables} - --datasetsEmptyDirs - ${datasetsEmptyDirs} - --finalTablesVisibleToImpala - ${finalTablesVisibleToImpala} - - - - - - - - diff --git a/dhp-workflows/dhp-usage-stats-update/pom.xml b/dhp-workflows/dhp-usage-stats-update/pom.xml deleted file mode 100644 index b56257ee5..000000000 --- a/dhp-workflows/dhp-usage-stats-update/pom.xml +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - - - - dhp-workflows - eu.dnetlib.dhp - 1.1.7-SNAPSHOT - - 4.0.0 - dhp-usage-stats-update - - - UTF-8 - UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 - - - - - org.apache.spark - spark-core_2.11 - 2.2.0 - - - org.apache.spark - spark-sql_2.11 - 2.4.5 - - - com.googlecode.json-simple - json-simple - 1.1.1 - - - org.json - json - 20180130 - jar - - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - c3p0 - c3p0 - 0.9.1.2 - jar - - - diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java deleted file mode 100644 index 29dd5648b..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.sql.Connection; -import java.sql.DriverManager; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.Properties; - -import org.apache.log4j.Logger; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -/** - * @author D. Pierrakos, S. Zoupanos - */ -import com.mchange.v2.c3p0.ComboPooledDataSource; - -public abstract class ConnectDB { - - public static Connection DB_HIVE_CONNECTION; - public static Connection DB_IMPALA_CONNECTION; - - private static String dbHiveUrl; - private static String dbImpalaUrl; - private static String usageStatsDBSchema; - private static String statsDBSchema; - private final static Logger log = Logger.getLogger(ConnectDB.class); - - static void init() throws ClassNotFoundException { - - dbHiveUrl = ExecuteWorkflow.dbHiveUrl; - dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; - usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; - statsDBSchema = ExecuteWorkflow.statsDBSchema; - - Class.forName("org.apache.hive.jdbc.HiveDriver"); - } - - public static Connection getHiveConnection() throws SQLException { - if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { - return DB_HIVE_CONNECTION; - } else { - DB_HIVE_CONNECTION = connectHive(); - - return DB_HIVE_CONNECTION; - } - } - - public static Connection getImpalaConnection() throws SQLException { - if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { - return DB_IMPALA_CONNECTION; - } else { - DB_IMPALA_CONNECTION = connectImpala(); - - return DB_IMPALA_CONNECTION; - } - } - - public static String getUsageStatsDBSchema() { - return ConnectDB.usageStatsDBSchema; - } - - public static String getStatsDBSchema() { - return ConnectDB.statsDBSchema; - } - - private static Connection connectHive() throws SQLException { - /* - * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = - * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbHiveUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); - - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); - - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - return cpds.getConnection(); - - } - - private static Connection connectImpala() throws SQLException { - /* - * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = - * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbImpalaUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); - - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); - - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - - return cpds.getConnection(); - - } - -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java deleted file mode 100644 index 50b951cbc..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java +++ /dev/null @@ -1,197 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.text.SimpleDateFormat; -import java.util.Calendar; -import java.util.Date; - -import org.apache.commons.io.IOUtils; -import org.apache.log4j.BasicConfigurator; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class ExecuteWorkflow { - - static String matomoAuthToken; - static String matomoBaseURL; - static String repoLogPath; - static String portalLogPath; - static String portalMatomoID; - static String irusUKBaseURL; - static String irusUKReportPath; - static String sarcsReportPathArray; - static String sarcsReportPathNonArray; - static String lareferenciaLogPath; - static String lareferenciaBaseURL; - static String lareferenciaAuthToken; - static String dbHiveUrl; - static String dbImpalaUrl; - static String usageStatsDBSchema; - static String statsDBSchema; - static boolean recreateDbAndTables; - - static boolean piwikEmptyDirs; - static boolean downloadPiwikLogs; - static boolean processPiwikLogs; - - static Calendar startingLogPeriod; - static Calendar endingLogPeriod; - static int numberOfPiwikIdsToDownload; - static int numberOfSiteIdsToDownload; - - static boolean laReferenciaEmptyDirs; - static boolean downloadLaReferenciaLogs; - static boolean processLaReferenciaLogs; - - static boolean irusCreateTablesEmptyDirs; - static boolean irusDownloadReports; - static boolean irusProcessStats; - static int irusNumberOfOpendoarsToDownload; - - static boolean sarcCreateTablesEmptyDirs; - static boolean sarcDownloadReports; - static boolean sarcProcessStats; - static int sarcNumberOfIssnToDownload; - - static boolean finalizeStats; - static boolean finalTablesVisibleToImpala; - - static int numberOfDownloadThreads; - - public static void main(String args[]) throws Exception { - - // Sending the logs to the console - BasicConfigurator.configure(); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - UsageStatsExporter.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json"))); - parser.parseArgument(args); - - // Setting up the initial parameters - matomoAuthToken = parser.get("matomoAuthToken"); - matomoBaseURL = parser.get("matomoBaseURL"); - repoLogPath = parser.get("repoLogPath"); - portalLogPath = parser.get("portalLogPath"); - portalMatomoID = parser.get("portalMatomoID"); - irusUKBaseURL = parser.get("irusUKBaseURL"); - irusUKReportPath = parser.get("irusUKReportPath"); - sarcsReportPathArray = parser.get("sarcsReportPathArray"); - sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); - lareferenciaLogPath = parser.get("lareferenciaLogPath"); - lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); - lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); - - dbHiveUrl = parser.get("dbHiveUrl"); - dbImpalaUrl = parser.get("dbImpalaUrl"); - usageStatsDBSchema = parser.get("usageStatsDBSchema"); - statsDBSchema = parser.get("statsDBSchema"); - - if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) - recreateDbAndTables = true; - else - recreateDbAndTables = false; - - if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) - piwikEmptyDirs = true; - else - piwikEmptyDirs = false; - - if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) - downloadPiwikLogs = true; - else - downloadPiwikLogs = false; - - if (parser.get("processPiwikLogs").toLowerCase().equals("true")) - processPiwikLogs = true; - else - processPiwikLogs = false; - - String startingLogPeriodStr = parser.get("startingLogPeriod"); - Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); - startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); - - String endingLogPeriodStr = parser.get("endingLogPeriod"); - Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); - endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); - - numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); - numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); - - if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) - laReferenciaEmptyDirs = true; - else - laReferenciaEmptyDirs = false; - - if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) - downloadLaReferenciaLogs = true; - else - downloadLaReferenciaLogs = false; - - if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) - processLaReferenciaLogs = true; - else - processLaReferenciaLogs = false; - - if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) - irusCreateTablesEmptyDirs = true; - else - irusCreateTablesEmptyDirs = false; - if (parser.get("irusDownloadReports").toLowerCase().equals("true")) - irusDownloadReports = true; - else - irusDownloadReports = false; - if (parser.get("irusProcessStats").toLowerCase().equals("true")) - irusProcessStats = true; - else - irusProcessStats = false; - irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); - - if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) - sarcCreateTablesEmptyDirs = true; - else - sarcCreateTablesEmptyDirs = false; - if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) - sarcDownloadReports = true; - else - sarcDownloadReports = false; - if (parser.get("sarcProcessStats").toLowerCase().equals("true")) - sarcProcessStats = true; - else - sarcProcessStats = false; - sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); - - if (parser.get("finalizeStats").toLowerCase().equals("true")) - finalizeStats = true; - else - finalizeStats = false; - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) - finalTablesVisibleToImpala = true; - else - finalTablesVisibleToImpala = false; - - numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); - - UsageStatsExporter usagestatsExport = new UsageStatsExporter(); - usagestatsExport.export(); - } - - private static Calendar startingLogPeriodStr(Date date) { - - Calendar calendar = Calendar.getInstance(); - calendar.setTime(date); - return calendar; - - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java deleted file mode 100644 index 011c90532..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java +++ /dev/null @@ -1,419 +0,0 @@ -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.URL; -import java.net.URLConnection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class IrusStats { - - private String irusUKURL; - - private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); - - public IrusStats(String irusUKURL) throws Exception { - this.irusUKURL = irusUKURL; - // The following may not be needed - It will be created when JSON tables are created -// createTmpTables(); - } - - public void reCreateLogDirs() throws Exception { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); - - logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); - } - - public void createTables() throws Exception { - try { - logger.info("Creating sushilog"); - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog(source STRING, " - + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " - + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableSushiLog); - logger.info("Created sushilog"); - - // To see how to apply to the ignore duplicate rules and indexes -// stmt.executeUpdate(sqlCreateTableSushiLog); -// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO sushilog " -// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," -// + "sushilog.rid, sushilog.date " -// + "FROM sushilog " -// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlcreateRuleSushiLog); -// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; -// stmt.executeUpdate(createSushiIndex); - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } - -// // The following may not be needed - It will be created when JSON tables are created -// private void createTmpTables() throws Exception { -// try { -// -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; -// stmt.executeUpdate(sqlCreateTableSushiLog); -// -// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;"); -// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;"; -// // stmt.executeUpdate(sqlCopyPublicSushiLog); -// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO sushilogtmp " -// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository," -// + "sushilogtmp.rid, sushilogtmp.date " -// + "FROM sushilogtmp " -// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlcreateRuleSushiLog); -// -// stmt.close(); -// ConnectDB.getConnection().close(); -// log.info("Sushi Tmp Tables Created"); -// } catch (Exception e) { -// log.error("Failed to create tables: " + e); -// throw new Exception("Failed to create tables: " + e.toString(), e); -// } -// } - public void processIrusStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); - - logger.info("Dropping sushilogtmp_json table"); - String dropSushilogtmpJson = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sushilogtmp_json"; - stmt.executeUpdate(dropSushilogtmpJson); - logger.info("Dropped sushilogtmp_json table"); - - logger.info("Creating irus_sushilogtmp_json table"); - String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" - + " `ItemIdentifier` ARRAY<\n" - + " struct<\n" - + " Type: STRING,\n" - + " Value: STRING\n" - + " >\n" - + " >,\n" - + " `ItemPerformance` ARRAY<\n" - + " struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >\n" - + " >\n" - + ")\n" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(createSushilogtmpJson); - logger.info("Created irus_sushilogtmp_json table"); - - logger.info("Dropping irus_sushilogtmp table"); - String dropSushilogtmp = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(dropSushilogtmp); - logger.info("Dropped irus_sushilogtmp table"); - - logger.info("Creating irus_sushilogtmp table"); - String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp(source STRING, repository STRING, " - + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + "tblproperties('transactional'='true')"; - stmt.executeUpdate(createSushilogtmp); - logger.info("Created irus_sushilogtmp table"); - - logger.info("Inserting to irus_sushilogtmp table"); - String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " - + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " - + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " - + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " - + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " - + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " - + "WHERE `ItemIdent`.`Type`= 'OAI'"; - stmt.executeUpdate(insertSushilogtmp); - logger.info("Inserted to irus_sushilogtmp table"); - - logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats table"); - - logger.info("Inserting into downloads_stats"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_oids ro " - + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; - stmt.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats"); - - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository_id` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); - - logger.info("Inserting to sushilog table"); - String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " - + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(insertToShushilog); - logger.info("Inserted to sushilog table"); - - ConnectDB.getHiveConnection().close(); - } - - public void getIrusRRReport(String irusUKReportPath) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); - - String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) - + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; - - logger.info("(getIrusRRReport) Getting report: " + reportUrl); - - String text = getJson(reportUrl, "", ""); - - List opendoarsToVisit = new ArrayList(); - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - int i = 0; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); - for (Object identifier : itemIdentifier) { - JSONObject opendoar = (JSONObject) identifier; - if (opendoar.get("Type").toString().equals("OpenDOAR")) { - i++; - opendoarsToVisit.add(opendoar.get("Value").toString()); - break; - } - } - // break; - } - - logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); - - if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 - && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - } - - logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); - - for (String opendoar : opendoarsToVisit) { - logger.info("Now working on openDoar: " + opendoar); - this.getIrusIRReport(opendoar, irusUKReportPath); - } - - logger.info("(getIrusRRReport) Finished with report: " + reportUrl); - } - - private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { - - logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); - - ConnectDB.getHiveConnection().setAutoCommit(false); - - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, "opendoar____::" + opendoar); - ResultSet rs_date = st.executeQuery(); - Date dateMax = null; - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - int batch_size = 0; - - if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); - } else { - while (start.before(end)) { - logger.info("date: " + simpleDateFormat.format(start.getTime())); - String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) - + "&RepositoryIdentifier=opendoar%3A" + opendoar - + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; - start.add(Calendar.MONTH, 1); - - logger.info("Downloading file: " + reportUrl); - String text = getJson(reportUrl, "", ""); - if (text == null) { - continue; - } - - FileSystem fs = FileSystem.get(new Configuration()); - String filePath = irusUKReportPath + "/" + "IrusIRReport_" - + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePath); - FSDataOutputStream fin = fs.create(new Path(filePath), true); - - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - if (jsonArray == null) { - continue; - } - String oai = ""; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - fin.write(jsonObjectRow.toJSONString().getBytes()); - fin.writeChar('\n'); - } - - fin.close(); - } - - } - //ConnectDB.getHiveConnection().close(); - - logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); - } - - private String getJson(String url) throws Exception { - try { - System.out.println("===> Connecting to: " + url); - URL website = new URL(url); - System.out.println("Connection url -----> " + url); - URLConnection connection = website.openConnection(); - - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); -// response.append("\n"); - } - } - - System.out.println("response ====> " + response.toString()); - - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } - - private String getJson(String url, String username, String password) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL", e); - return null; - } - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java deleted file mode 100644 index 7a61b1f46..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java +++ /dev/null @@ -1,265 +0,0 @@ -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.URL; -import java.net.URLConnection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class LaReferenciaDownloadLogs { - - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; - - /* - * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; - private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; - - private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); - - public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; - this.createTables(); -// this.createTmpTables(); - } - - public void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); - - logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); - } - - private void createTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating LaReferencia tables"); - String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " - + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " - + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " - + "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableLareferenciaLog); - logger.info("Created LaReferencia tables"); -// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialog " -// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," -// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " -// + "FROM lareferencialog " -// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; -// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); -// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Lareferencia Tables Created"); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - // System.exit(0); - } - } - -// private void createTmpTables() throws Exception { -// -// try { -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; -// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialogtmp " -// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," -// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " -// + "FROM lareferencialogtmp " -// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); -// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); -// -// stmt.close(); -// log.info("Lareferencia Tmp Tables Created"); -// -// } catch (Exception e) { -// log.error("Failed to create tmptables: " + e); -// throw new Exception("Failed to create tmp tables: " + e.toString(), e); -// // System.exit(0); -// } -// } - private String getPiwikLogUrl() { - return piwikUrl + "/"; - } - - private String getJson(String url) throws Exception { - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); -// response.append("\n"); - } - } - - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } - - public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { - - String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; - String content = ""; - - List siteIdsToVisit = new ArrayList(); - - // Getting all the siteIds in a list for logging reasons & limiting the list - // to the max number of siteIds - content = getJson(baseApiUrl); - JSONParser parser = new JSONParser(); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); - } - logger.info("Found the following siteIds for download: " + siteIdsToVisit); - - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 - && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } - - logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); - - for (int siteId : siteIdsToVisit) { - logger.info("Now working on LaReferencia MatomoId: " + siteId); - this.GetLaReFerenciaLogs(repoLogsPath, siteId); - } - } - - public void GetLaReFerenciaLogs(String repoLogsPath, - int laReferencialMatomoID) throws Exception { - - logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); - - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); - - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".lareferencialog WHERE matomoid=?"); - st.setInt(1, laReferencialMatomoID); - Date dateMax = null; - - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - Date date = currDay.getTime(); - if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID); - } else { - logger - .info( - "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " - + sdf.format(date)); - - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - outFolder = repoLogsPath; - - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), - true); - - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - int i = 0; - - JSONParser parser = new JSONParser(); - do { - String apiUrl = baseApiUrl; - - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } - - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } - - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - fin.write(jsonObjectRaw.toJSONString().getBytes()); - fin.writeChar('\n'); - } - - logger - .info( - "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID - + " and for " - + sdf.format(date)); - i++; - } while (true); - fin.close(); - } - } - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java deleted file mode 100644 index 747b5ce0e..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java +++ /dev/null @@ -1,436 +0,0 @@ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.URLDecoder; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.SQLException; -import java.sql.Statement; -import java.sql.Timestamp; -import java.text.SimpleDateFormat; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class LaReferenciaStats { - - private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class); - - private String logRepoPath; - - private Statement stmt = null; - - private String CounterRobotsURL; - private ArrayList robotsList; - - public LaReferenciaStats(String logRepoPath) throws Exception { - this.logRepoPath = logRepoPath; - this.createTables(); -// this.createTmpTables(); - } - - /* - * private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) { - * log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } } - */ - private void createTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating LaReferencia tables"); - String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + - "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + - "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + - "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableLareferenciaLog); - logger.info("Created LaReferencia tables"); -// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialog " -// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," -// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " -// + "FROM lareferencialog " -// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; -// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); -// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Lareferencia Tables Created"); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - // System.exit(0); - } - } - -// private void createTmpTables() throws Exception { -// -// try { -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; -// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialogtmp " -// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," -// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " -// + "FROM lareferencialogtmp " -// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); -// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); -// -// stmt.close(); -// log.info("Lareferencia Tmp Tables Created"); -// -// } catch (Exception e) { -// log.error("Failed to create tmptables: " + e); -// throw new Exception("Failed to create tmp tables: " + e.toString(), e); -// // System.exit(0); -// } -// } - - public void processLogs() throws Exception { - try { - logger.info("Processing LaReferencia repository logs"); - processlaReferenciaLog(); - logger.info("LaReferencia repository logs process done"); - - logger.info("LaReferencia removing double clicks"); - removeDoubleClicks(); - logger.info("LaReferencia removed double clicks"); - - logger.info("LaReferencia creating viewsStats"); - viewsStats(); - logger.info("LaReferencia created viewsStats"); - logger.info("LaReferencia creating downloadsStats"); - downloadsStats(); - logger.info("LaReferencia created downloadsStats"); - logger.info("LaReferencia updating Production Tables"); - updateProdTables(); - logger.info("LaReferencia updated Production Tables"); - - } catch (Exception e) { - logger.error("Failed to process logs: " + e); - throw new Exception("Failed to process logs: " + e.toString(), e); - } - } - - public void processlaReferenciaLog() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); - - logger.info("Dropping lareferencialogtmp_json table"); - String drop_lareferencialogtmp_json = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".lareferencialogtmp_json"; - stmt.executeUpdate(drop_lareferencialogtmp_json); - logger.info("Dropped lareferencialogtmp_json table"); - - logger.info("Creating lareferencialogtmp_json"); - String create_lareferencialogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".lareferencialogtmp_json(\n" + - " `idSite` STRING,\n" + - " `idVisit` STRING,\n" + - " `country` STRING,\n" + - " `referrerName` STRING,\n" + - " `browser` STRING,\n" + - " `repItem` STRING,\n" + - " `actionDetails` ARRAY<\n" + - " struct<\n" + - " timestamp: STRING,\n" + - " type: STRING,\n" + - " url: STRING,\n" + - " `customVariables`: struct<\n" + - " `1`: struct<\n" + - " `customVariablePageValue1`: STRING\n" + - " >,\n" + - " `2`: struct<\n" + - " `customVariablePageValue2`: STRING\n" + - " >\n" + - " >\n" + - " >\n" + - " >" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.lareferenciaLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_lareferencialogtmp_json); - logger.info("Created lareferencialogtmp_json"); - - logger.info("Dropping lareferencialogtmp table"); - String drop_lareferencialogtmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".lareferencialogtmp"; - stmt.executeUpdate(drop_lareferencialogtmp); - logger.info("Dropped lareferencialogtmp table"); - - logger.info("Creating lareferencialogtmp"); - String create_lareferencialogtmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp(matomoid INT, " + - "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + - "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + - "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(create_lareferencialogtmp); - logger.info("Created lareferencialogtmp"); - - logger.info("Inserting into lareferencialogtmp"); - String insert_lareferencialogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp " + - "SELECT DISTINCT cast(idSite as INT) as matomoid, CONCAT('opendoar____::', " + - "actiondetail.customVariables.`2`.customVariablePageValue2) as source, idVisit as id_Visit, country, " + - "actiondetail.type as action, actiondetail.url as url, " + - "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + - "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + - "referrerName as referrer_name, browser as agent " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json " + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; - stmt.executeUpdate(insert_lareferencialogtmp); - logger.info("Inserted into lareferencialogtmp"); - - stmt.close(); - } - - public void removeDoubleClicks() throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Cleaning download double clicks"); - // clean download double clicks - String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" + - "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " + - ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " + - "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " + - "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " + - "AND p1.timestamp listHdfsDir(String dir) throws Exception { - FileSystem hdfs = FileSystem.get(new Configuration()); - RemoteIterator Files; - ArrayList fileNames = new ArrayList<>(); - - try { - Path exportPath = new Path(hdfs.getUri() + dir); - Files = hdfs.listFiles(exportPath, false); - while (Files.hasNext()) { - String fileName = Files.next().getPath().toString(); - // log.info("Found hdfs file " + fileName); - fileNames.add(fileName); - } - // hdfs.close(); - } catch (Exception e) { - logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logRepoPath)); - throw new Exception("HDFS file path with exported data does not exist : " + logRepoPath, e); - } - - return fileNames; - } - - private String readHDFSFile(String filename) throws Exception { - String result; - try { - - FileSystem fs = FileSystem.get(new Configuration()); - // log.info("reading file : " + filename); - - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); - - StringBuilder sb = new StringBuilder(); - String line = br.readLine(); - - while (line != null) { - if (!line.equals("[]")) { - sb.append(line); - } - // sb.append(line); - line = br.readLine(); - } - result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); - if (result.equals("")) { - result = "[]"; - } - - // fs.close(); - } catch (Exception e) { - logger.error(e.getMessage()); - throw new Exception(e); - } - - return result; - } - -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java deleted file mode 100644 index 8f7fffa9f..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java +++ /dev/null @@ -1,325 +0,0 @@ -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.Authenticator; -import java.net.URL; -import java.net.URLConnection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class PiwikDownloadLogs { - - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; - - /* - * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; - - private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); - - public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; - - } - - private String getPiwikLogUrl() { - return "https://" + piwikUrl + "/"; - } - - private String getJson(String url) throws Exception { - try { - logger.debug("Connecting to download the JSON: " + url); - URL website = new URL(url); - URLConnection connection = website.openConnection(); - - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + url + " Exception: " + e); - throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); - } - } - - class WorkerThread implements Runnable { - - private Calendar currDay; - private int siteId; - private String repoLogsPath; - private String portalLogPath; - private String portalMatomoID; - - public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws IOException { - this.currDay = (Calendar) currDay.clone(); - this.siteId = new Integer(siteId); - this.repoLogsPath = new String(repoLogsPath); - this.portalLogPath = new String(portalLogPath); - this.portalMatomoID = new String(portalMatomoID); - } - - public void run() { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - System.out - .println( - Thread.currentThread().getName() + " (Start) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId - + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath - + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - try { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - System.out - .println( - Thread.currentThread().getName() + " (End) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId - + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath - + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - } - - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } - - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - - int i = 0; - - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); - - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; - - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } - - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } - - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - writtenBytes += jsonObjectRawBytes.length + 1; - } - - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); - - i++; - } while (true); - - fs.close(); - } - } - - public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { - - Statement statement = ConnectDB.getHiveConnection().createStatement(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - - ResultSet rs = statement - .executeQuery( - "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() - + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); - - // Getting all the piwikids in a list for logging reasons & limitting the list - // to the max number of piwikids - List piwikIdToVisit = new ArrayList(); - while (rs.next()) - piwikIdToVisit.add(rs.getInt(1)); - logger.info("Found the following piwikIds for download: " + piwikIdToVisit); - - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 - && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { - logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } - - logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); - - // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); - for (int siteId : piwikIdToVisit) { - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); - - logger.info("Now working on piwikId: " + siteId); - - PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".piwiklog WHERE source=?"); - st.setInt(1, siteId); - Date dateMax = null; - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); - - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - // logger.info("Date used " + currDay.toString()); - // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - // executor.execute(worker);// calling execute method of ExecutorService - logger.info("Date used " + currDay.getTime().toString()); - - if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); - } else { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - } - - } - } - // executor.shutdown(); - // while (!executor.isTerminated()) { - // } - // System.out.println("Finished all threads"); - } - - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } - - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - - int i = 0; - - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); - - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; - - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } - - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } - - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - writtenBytes += jsonObjectRawBytes.length + 1; - } - - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); - - i++; - } while (true); - - fs.close(); - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java deleted file mode 100644 index 6d5bdfac0..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java +++ /dev/null @@ -1,1262 +0,0 @@ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.URLDecoder; -import java.sql.Connection; -import java.sql.SQLException; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.*; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class PiwikStatsDB { - - private String logPath; - private String logRepoPath; - private String logPortalPath; - - private Statement stmt = null; - - private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); - - private String CounterRobotsURL; - private ArrayList robotsList; - - public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception { - this.logRepoPath = logRepoPath; - this.logPortalPath = logPortalPath; - - } - - public void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); - dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); - - logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); - dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); - - logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); - - logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); - } - - public void recreateDBAndTables() throws Exception { - this.createDatabase(); - this.createTables(); - // The piwiklog table is not needed since it is built - // on top of JSON files - this.createTmpTables(); - } - - public ArrayList getRobotsList() { - return robotsList; - } - - public void setRobotsList(ArrayList robotsList) { - this.robotsList = robotsList; - } - - public String getCounterRobotsURL() { - return CounterRobotsURL; - } - - public void setCounterRobotsURL(String CounterRobotsURL) { - this.CounterRobotsURL = CounterRobotsURL; - } - - private void createDatabase() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); - } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); - } - - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); - - } catch (Exception e) { - logger.error("Failed to create database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } - } - - private void createTables() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - // Create Piwiklog table - This table should exist - String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) " - + "into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTablePiwikLog); - - ///////////////////////////////////////// - // Rule for duplicate inserts @ piwiklog - ///////////////////////////////////////// - - String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTablePortalLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ process_portal_log - ////////////////////////////////////////////////// - - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } - - private void createTmpTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " - + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " - + "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTmpTablePiwikLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ piwiklogtmp - ////////////////////////////////////////////////// - - ////////////////////////////////////////////////// - // Copy from public.piwiklog to piwiklog - ////////////////////////////////////////////////// - // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;"; - // stmt.executeUpdate(sqlCopyPublicPiwiklog); - - String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTmpTablePortalLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ process_portal_log_tmp - ////////////////////////////////////////////////// - - stmt.close(); - - } catch (Exception e) { - logger.error("Failed to create tmptables: " + e); - throw new Exception("Failed to create tmp tables: " + e.toString(), e); - // System.exit(0); - } - } - - public void processLogs() throws Exception { - try { - ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); - this.robotsList = counterRobots.getRobotsPatterns(); - - logger.info("Processing repository logs"); - processRepositoryLog(); - logger.info("Repository logs process done"); - - logger.info("Removing double clicks"); - removeDoubleClicks(); - logger.info("Removing double clicks done"); - - logger.info("Cleaning oai"); - cleanOAI(); - logger.info("Cleaning oai done"); - - logger.info("Processing portal logs"); - processPortalLog(); - logger.info("Portal logs process done"); - - logger.info("Processing portal usagestats"); - portalStats(); - logger.info("Portal usagestats process done"); - - logger.info("ViewsStats processing starts"); - viewsStats(); - logger.info("ViewsStats processing ends"); - - logger.info("DownloadsStats processing starts"); - downloadsStats(); - logger.info("DownloadsStats processing starts"); - - logger.info("Updating Production Tables"); - updateProdTables(); - logger.info("Updated Production Tables"); - - } catch (Exception e) { - logger.error("Failed to process logs: " + e); - throw new Exception("Failed to process logs: " + e.toString(), e); - } - } - - public void processRepositoryLog() throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); - - logger.info("Dropping piwiklogtmp_json table"); - String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp_json"; - stmt.executeUpdate(drop_piwiklogtmp_json); - logger.info("Dropped piwiklogtmp_json table"); - - logger.info("Creating piwiklogtmp_json"); - String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp_json(\n" + - " `idSite` STRING,\n" + - " `idVisit` STRING,\n" + - " `country` STRING,\n" + - " `referrerName` STRING,\n" + - " `browser` STRING,\n" + - " `actionDetails` ARRAY<\n" + - " struct<\n" + - " type: STRING,\n" + - " url: STRING,\n" + - " `customVariables`: struct<\n" + - " `1`: struct<\n" + - " `customVariablePageValue1`: STRING\n" + - " >\n" + - " >,\n" + - " timestamp: String\n" + - " >\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_piwiklogtmp_json); - logger.info("Created piwiklogtmp_json"); - - logger.info("Dropping piwiklogtmp table"); - String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp"; - stmt.executeUpdate(drop_piwiklogtmp); - logger.info("Dropped piwiklogtmp"); - - logger.info("Creating piwiklogtmp"); - String create_piwiklogtmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + - "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(create_piwiklogtmp); - logger.info("Created piwiklogtmp"); - - logger.info("Inserting into piwiklogtmp"); - String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + - "actiondetail.type as action, actiondetail.url as url, " + - "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + - "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + - "referrerName as referrer_name, browser as agent\n" + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; - stmt.executeUpdate(insert_piwiklogtmp); - logger.info("Inserted into piwiklogtmp"); - - stmt.close(); - } - - public void removeDoubleClicks() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Cleaning download double clicks"); - // clean download double clicks - String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "WHERE EXISTS (\n" + - "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + - ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + - "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" - + - "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + - "AND p1.timestamp\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_process_portal_log_tmp_json); - logger.info("Created process_portal_log_tmp_json"); - - logger.info("Droping process_portal_log_tmp table"); - String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".process_portal_log_tmp"; - stmt.executeUpdate(drop_process_portal_log_tmp); - logger.info("Dropped process_portal_log_tmp"); - - logger.info("Creating process_portal_log_tmp"); - String create_process_portal_log_tmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + - ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + - "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(create_process_portal_log_tmp); - logger.info("Created process_portal_log_tmp"); - - logger.info("Inserting into process_portal_log_tmp"); - String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log_tmp " + - "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " - + - "actiondetail.url as url, " + - "CASE\n" + - " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + - " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + - " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " - + - " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + - " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + - " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + - " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + - " ELSE '' " + - "END AS entity_id, " + - "CASE " + - " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + - " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + - " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + - " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + - " ELSE '' " + - "END AS source_item_type, " + - "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + - "browser as agent " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; - stmt.executeUpdate(insert_process_portal_log_tmp); - logger.info("Inserted into process_portal_log_tmp"); - - stmt.close(); - } - - public void portalStats() throws SQLException { - Connection con = ConnectDB.getHiveConnection(); - Statement stmt = con.createStatement(); - con.setAutoCommit(false); - -// Original queries where of the style -// -// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp2, -// openaire_prod_stats_20200821.result_oids roid -// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null -// -// The following query is an example of how queries should be -// -// -// INSERT INTO usagestats_20200907.piwiklogtmp -// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp -// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id -// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL); -// -// We should consider if we would like the queries to be as the following -// -// INSERT INTO usagestats_20200907.piwiklogtmp -// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp -// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id -// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND -// roid.oid != ''); - - logger.info("PortalStats - Step 1"); - String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() - + ".result_oids roid WHERE roid.id IS NOT NULL)"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("PortalStats - Step 2"); - stmt = con.createStatement(); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() - + ".datasource_oids roid WHERE roid.id IS NOT NULL)"; - stmt.executeUpdate(sql); - stmt.close(); - - /* - * logger.info("PortalStats - Step 3"); stmt = con.createStatement(); sql = "INSERT INTO " + - * ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - * "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'organization', `timestamp`, referrer_name, agent " - * + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - * "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - * "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + - * ".organization_oids roid WHERE roid.id IS NOT NULL)"; // stmt.executeUpdate(sql); stmt.close(); - */ - logger.info("PortalStats - Step 3"); - stmt = con.createStatement(); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() - + ".project_oids roid WHERE roid.id IS NOT NULL)"; - stmt.executeUpdate(sql); - stmt.close(); - - con.close(); - } - - private void cleanOAI() throws Exception { - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Cleaning oai - Step 1"); - stmt = ConnectDB.getHiveConnection().createStatement(); - String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + - "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 2"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + - "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 3"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + - "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 4"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + - "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 5"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + - "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 6"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + - "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 7"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + - "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 8"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + - "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 9"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + - "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 10"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + - "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 11"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + - "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 12"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + - "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 13"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + - "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 14"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + - "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 15"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + - "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 16"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + - "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 17"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + - "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 18"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + - "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 19"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + - "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 20"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + - "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 21"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + - "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 22"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + - "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 23"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + - "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 24"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + - "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 25"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + - "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 26"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + - "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 27"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + - "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 28"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + - "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 29"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + - "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Done, closing connection"); - ConnectDB.getHiveConnection().close(); - } - - private String processPortalURL(String url) { - - if (url.indexOf("explore.openaire.eu") > 0) { - try { - url = URLDecoder.decode(url, "UTF-8"); - } catch (Exception e) { - logger.info("Error when decoding the following URL: " + url); - } - if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) { - url = "datasource|" - + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59); - } else if (url.indexOf("datasource=") > 0 - && url.substring(url.indexOf("datasource=") + 11).length() >= 46) { - url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57); - } else if (url.indexOf("datasourceFilter=") > 0 - && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) { - url = "datasource|" - + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63); - } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) { - url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56); - } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) { - url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56); - } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 - && !url.contains("oai:dnet:corda")) { - url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56); - } else if (url.indexOf("organizationId=") > 0 - && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) { - url = "organization|" - + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61); - } else { - url = ""; - } - } else { - url = ""; - } - - return url; - } - - private void updateProdTables() throws SQLException { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Inserting data to piwiklog"); - String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to pageviews_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Creating usage_stats table"); - String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + - "AS SELECT coalesce(ds.source, vs.source) as source, " + - "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + - "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + - "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + - "coalesce(ds.openaire, 0) as openaire_downloads, " + - "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + - "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(createUsageStats); - logger.info("Created usage_stats table"); - - - /* - * logger.info("Dropping table views_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table downloads_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table pageviews_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table process_portal_log_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; stmt.executeUpdate(sql); - */ - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } - - private ArrayList listHdfsDir(String dir) throws Exception { - - FileSystem hdfs = FileSystem.get(new Configuration()); - RemoteIterator Files; - ArrayList fileNames = new ArrayList<>(); - - try { - Path exportPath = new Path(hdfs.getUri() + dir); - Files = hdfs.listFiles(exportPath, false); - while (Files.hasNext()) { - String fileName = Files.next().getPath().toString(); - fileNames.add(fileName); - } - - hdfs.close(); - } catch (Exception e) { - logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath)); - throw new Exception("HDFS file path with exported data does not exist : " + logPath, e); - } - - return fileNames; - } - - private String readHDFSFile(String filename) throws Exception { - String result; - try { - - FileSystem fs = FileSystem.get(new Configuration()); - // log.info("reading file : " + filename); - - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); - - StringBuilder sb = new StringBuilder(); - String line = br.readLine(); - - while (line != null) { - if (!line.equals("[]")) { - sb.append(line); - } - // sb.append(line); - line = br.readLine(); - } - result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); - if (result.equals("")) { - result = "[]"; - } - - // fs.close(); - } catch (Exception e) { - logger.error(e.getMessage()); - throw new Exception(e); - } - - return result; - } - - private Connection getConnection() throws SQLException { - return ConnectDB.getHiveConnection(); - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java deleted file mode 100644 index 1708a1c64..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.usagestats.export; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -/** - * @author D. Pierrakos, S. Zoupanos - */ -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.net.URL; -import java.nio.charset.Charset; -import java.util.ArrayList; - -import org.json.JSONException; -import org.json.simple.JSONArray; -import org.json.simple.parser.JSONParser; -import org.json.simple.parser.ParseException; - -public class ReadCounterRobotsList { - - private ArrayList robotsPatterns = new ArrayList(); - private String COUNTER_ROBOTS_URL; - - public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException { - COUNTER_ROBOTS_URL = url; - robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL); - } - - private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException { - InputStream is = new URL(url).openStream(); - JSONParser parser = new JSONParser(); - BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1"))); - JSONArray jsonArray = (JSONArray) parser.parse(reader); - for (Object aJsonArray : jsonArray) { - org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray; - robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\")); - } - return robotsPatterns; - } - - public ArrayList getRobotsPatterns() { - return robotsPatterns; - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java deleted file mode 100644 index 06e350c9e..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java +++ /dev/null @@ -1,575 +0,0 @@ -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -// import java.io.BufferedReader; -// import java.io.InputStreamReader; -import java.net.URL; -import java.net.URLConnection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.json.simple.parser.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class SarcStats { - - private Statement stmtHive = null; - private Statement stmtImpala = null; - - private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); - - public SarcStats() throws Exception { -// createTables(); - } - - private void createTables() throws Exception { - try { - - stmtHive = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; - stmtHive.executeUpdate(sqlCreateTableSushiLog); - - // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; - // stmt.executeUpdate(sqlCopyPublicSushiLog); - String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " - + " ON INSERT TO sushilog " - + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," - + "sushilog.rid, sushilog.date " - + "FROM sushilog " - + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; - stmtHive.executeUpdate(sqlcreateRuleSushiLog); - String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; - stmtHive.executeUpdate(createSushiIndex); - - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } - - public void reCreateLogDirs() throws IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); - dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); - - logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); - dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); - - logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); - dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); - - logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); - dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); - } - - public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); - - logger.info("Dropping sarc_sushilogtmp_json_array table"); - String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); - logger.info("Dropped sarc_sushilogtmp_json_array table"); - - logger.info("Creating sarc_sushilogtmp_json_array table"); - String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" - + " `ItemIdentifier` ARRAY<\n" - + " struct<\n" - + " `Type`: STRING,\n" - + " `Value`: STRING\n" - + " >\n" - + " >,\n" - + " `ItemPerformance` struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >\n" - + ")" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + sarcsReportPathArray + "/'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_array); - logger.info("Created sarc_sushilogtmp_json_array table"); - - logger.info("Dropping sarc_sushilogtmp_json_non_array table"); - String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); - logger.info("Dropped sarc_sushilogtmp_json_non_array table"); - - logger.info("Creating sarc_sushilogtmp_json_non_array table"); - String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" - + " `ItemIdentifier` struct<\n" - + " `Type`: STRING,\n" - + " `Value`: STRING\n" - + " >,\n" - + " `ItemPerformance` struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >" - + ")" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + sarcsReportPathNonArray + "/'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); - logger.info("Created sarc_sushilogtmp_json_non_array table"); - - logger.info("Creating sarc_sushilogtmp table"); - String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp(source STRING, repository STRING, " - + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + "tblproperties('transactional'='true')"; - stmt.executeUpdate(create_sarc_sushilogtmp); - logger.info("Created sarc_sushilogtmp table"); - - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " - + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " - + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " - + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " - + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " - + "WHERE `ItemIdent`.`Type`='DOI'"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " - + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " - + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " - + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - - ConnectDB.getHiveConnection().close(); - } - - public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); - - logger.info("Dropping sarc_sushilogtmp table"); - String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp"; - stmt.executeUpdate(drop_sarc_sushilogtmp); - logger.info("Dropped sarc_sushilogtmp table"); - ConnectDB.getHiveConnection().close(); - - List issnAndUrls = new ArrayList(); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" - }); - issnAndUrls.add(new String[]{ - "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" - }); - - if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 - && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); - issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); - } - - logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); - - for (String[] issnAndUrl : issnAndUrls) { - logger.info("Now working on ISSN: " + issnAndUrl[1]); - getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); - } - - } - - public void finalizeSarcStats() throws Exception { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - stmtImpala = ConnectDB.getImpalaConnection().createStatement(); - - logger.info("Creating downloads_stats table_tmp"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_tmp " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmtHive.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats_tmp table"); - - logger.info("Dropping sarc_sushilogtmp_impala table"); - String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala"; - stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); - logger.info("Dropped sarc_sushilogtmp_impala table"); - - logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); - String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala " - + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(createSarcSushilogtmpImpala); - logger.info("Created sarc_sushilogtmp_impala"); - - logger.info("Making sarc_sushilogtmp visible to impala"); - String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala;"; - stmtImpala.executeUpdate(invalidateMetadata); - - logger.info("Dropping downloads_stats_impala table"); - String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala"; - stmtHive.executeUpdate(drop_downloads_stats_impala); - logger.info("Dropped downloads_stats_impala table"); - - logger.info("Making downloads_stats_impala deletion visible to impala"); - try { - String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala;"; - stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); - } catch (SQLException sqle) { - } - - // We run the following query in Impala because it is faster - logger.info("Creating downloads_stats_impala"); - String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala AS " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " - + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_pids ro " - + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " - + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; - stmtImpala.executeUpdate(createDownloadsStatsImpala); - logger.info("Creating downloads_stats_impala"); - - // Insert into downloads_stats - logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_tmp SELECT * " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; - stmtHive.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats_tmp"); - - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository_id` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmtHive.executeUpdate(createSushilog); - logger.info("Created sushilog table"); - - // Insert into sushilog - logger.info("Inserting into sushilog"); - String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(insertSushiLog); - logger.info("Inserted into sushilog"); - - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - } - - public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, - String url, String issn) throws Exception { - logger.info("Processing SARC! issn: " + issn + " with url: " + url); - ConnectDB.getHiveConnection().setAutoCommit(false); - - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, issn); - ResultSet rs_date = st.executeQuery(); - Date dateMax = null; - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - - // Creating the needed configuration for the correct storing of data - Configuration config = new Configuration(); - config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); - config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); - config - .set( - "fs.hdfs.impl", - org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - config - .set( - "fs.file.impl", - org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem dfs = FileSystem.get(config); - - if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn); - } else { - - while (start.before(end)) { - String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); - start.add(Calendar.MONTH, 1); - - logger.info("(getARReport) Getting report: " + reportUrl); - String text = getJson(reportUrl); - if (text == null) { - continue; - } - - JSONParser parser = new JSONParser(); - JSONObject jsonObject = null; - try { - jsonObject = (JSONObject) parser.parse(text); - } // if there is a parsing error continue with the next url - catch (ParseException pe) { - continue; - } - - jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("sc:Report"); - if (jsonObject == null) { - continue; - } - jsonObject = (JSONObject) jsonObject.get("c:Report"); - jsonObject = (JSONObject) jsonObject.get("c:Customer"); - Object obj = jsonObject.get("c:ReportItems"); - JSONArray jsonArray = new JSONArray(); - if (obj instanceof JSONObject) { - jsonArray.add(obj); - } else { - jsonArray = (JSONArray) obj; - // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); - } - if (jsonArray == null) { - continue; - } - - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" - + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathArray); - FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); - - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" - + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathNonArray); - FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); - - for (Object aJsonArray : jsonArray) { - - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - renameKeysRecursively(":", jsonObjectRow); - - if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { - finNonArray.write(jsonObjectRow.toJSONString().getBytes()); - finNonArray.writeChar('\n'); - } else { - finArray.write(jsonObjectRow.toJSONString().getBytes()); - finArray.writeChar('\n'); - } - } - - finArray.close(); - finNonArray.close(); - - // Check the file size and if it is too big, delete it - File fileArray = new File(filePathArray); - if (fileArray.length() == 0) - fileArray.delete(); - File fileNonArray = new File(filePathNonArray); - if (fileNonArray.length() == 0) - fileNonArray.delete(); - - } - - dfs.close(); - } - //ConnectDB.getHiveConnection().close(); - } - - private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { - for (Object jjval : givenJsonObj) { - if (jjval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jjval); - } else if (jjval instanceof JSONObject) { - renameKeysRecursively(delimiter, (JSONObject) jjval); - } // All other types of vals - else - ; - } - } - - private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { - Set jkeys = new HashSet(givenJsonObj.keySet()); - for (String jkey : jkeys) { - - String[] splitArray = jkey.split(delimiter); - String newJkey = splitArray[splitArray.length - 1]; - - Object jval = givenJsonObj.get(jkey); - givenJsonObj.remove(jkey); - givenJsonObj.put(newJkey, jval); - - if (jval instanceof JSONObject) { - renameKeysRecursively(delimiter, (JSONObject) jval); - } - - if (jval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jval); - } - } - } - - private String getJson(String url) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { - - // Logging error and silently continuing - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); -// return null; -// throw new Exception("Failed to get URL: " + e.toString(), e); - } - return ""; - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java deleted file mode 100644 index 405b58bd5..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java +++ /dev/null @@ -1,179 +0,0 @@ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.IOException; -import java.sql.SQLException; -import java.sql.Statement; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Main class for downloading and processing Usage statistics - * - * @author D. Pierrakos, S. Zoupanos - */ -public class UsageStatsExporter { - - public UsageStatsExporter() { - - } - - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - private void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); - dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); - - logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); - dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); - - logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); - - logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); - - logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); - - logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); - } - - public void export() throws Exception { - - logger.info("Initialising DB properties"); - ConnectDB.init(); - -// runImpalaQuery(); - - PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); - - logger.info("Re-creating database and tables"); - if (ExecuteWorkflow.recreateDbAndTables) - piwikstatsdb.recreateDBAndTables(); - ; - - logger.info("Initializing the download logs module"); - PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); - - if (ExecuteWorkflow.piwikEmptyDirs) { - logger.info("Recreating Piwik log directories"); - piwikstatsdb.reCreateLogDirs(); - } - - // Downloading piwik logs (also managing directory creation) - if (ExecuteWorkflow.downloadPiwikLogs) { - logger.info("Downloading piwik logs"); - piwd - .GetOpenAIRELogs( - ExecuteWorkflow.repoLogPath, - ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); - } - logger.info("Downloaded piwik logs"); - - // Create DB tables, insert/update statistics - String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; - piwikstatsdb.setCounterRobotsURL(cRobotsUrl); - - if (ExecuteWorkflow.processPiwikLogs) { - logger.info("Processing logs"); - piwikstatsdb.processLogs(); - } - - logger.info("Creating LaReferencia tables"); - LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, - ExecuteWorkflow.lareferenciaAuthToken); - - if (ExecuteWorkflow.laReferenciaEmptyDirs) { - logger.info("Recreating LaReferencia log directories"); - lrf.reCreateLogDirs(); - } - - if (ExecuteWorkflow.downloadLaReferenciaLogs) { - logger.info("Downloading LaReferencia logs"); - lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); - logger.info("Downloaded LaReferencia logs"); - } - LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); - - if (ExecuteWorkflow.processLaReferenciaLogs) { - logger.info("Processing LaReferencia logs"); - lastats.processLogs(); - logger.info("LaReferencia logs done"); - } - - IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); - if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { - logger.info("Creating Irus Stats tables"); - irusstats.createTables(); - logger.info("Created Irus Stats tables"); - - logger.info("Re-create log dirs"); - irusstats.reCreateLogDirs(); - logger.info("Re-created log dirs"); - } - - if (ExecuteWorkflow.irusDownloadReports) { - irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); - } - if (ExecuteWorkflow.irusProcessStats) { - irusstats.processIrusStats(); - logger.info("Irus done"); - } - - SarcStats sarcStats = new SarcStats(); - if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { - sarcStats.reCreateLogDirs(); - } - if (ExecuteWorkflow.sarcDownloadReports) { - sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - } - if (ExecuteWorkflow.sarcProcessStats) { - sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - sarcStats.finalizeSarcStats(); - } - logger.info("Sarc done"); - - // finalize usagestats - if (ExecuteWorkflow.finalizeStats) { - piwikstatsdb.finalizeStats(); - logger.info("Finalized stats"); - } - - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } - - logger.info("End"); - } - - private void invalidateMetadata() throws SQLException { - Statement stmt = null; - - stmt = ConnectDB.getImpalaConnection().createStatement(); - - String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; - stmt.executeUpdate(sql); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json deleted file mode 100644 index 988c23b48..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json +++ /dev/null @@ -1,231 +0,0 @@ -[ - { - "paramName": "mat", - "paramLongName": "matomoAuthToken", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false - }, - { - "paramName": "mbu", - "paramLongName": "matomoBaseURL", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, - { - "paramName": "rlp", - "paramLongName": "repoLogPath", - "paramDescription": "nameNode of the source cluster", - "paramRequired": true - }, - { - "paramName": "plp", - "paramLongName": "portalLogPath", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "pmi", - "paramLongName": "portalMatomoID", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "iukbuw", - "paramLongName": "irusUKBaseURL", - "paramDescription": "working directory", - "paramRequired": true - }, - { - "paramName": "iukrp", - "paramLongName": "irusUKReportPath", - "paramDescription": "maximum number of map tasks used in the distcp process", - "paramRequired": true - }, - { - "paramName": "srpa", - "paramLongName": "sarcsReportPathArray", - "paramDescription": "memory for distcp action copying actionsets from remote cluster", - "paramRequired": true - }, - { - "paramName": "srpna", - "paramLongName": "sarcsReportPathNonArray", - "paramDescription": "timeout for distcp copying actions from remote cluster", - "paramRequired": true - }, - { - "paramName": "llp", - "paramLongName": "lareferenciaLogPath", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "lbu", - "paramLongName": "lareferenciaBaseURL", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "lat", - "paramLongName": "lareferenciaAuthToken", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "usdbs", - "paramLongName": "usageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "piwikEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ppwl", - "paramLongName": "processPiwikLogs", - "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", - "paramRequired": true - }, - { - "paramName": "dpwl", - "paramLongName": "downloadPiwikLogs", - "paramDescription": "download piwik logs?", - "paramRequired": true - }, - { - "paramName": "slp", - "paramLongName": "startingLogPeriod", - "paramDescription": "Starting log period", - "paramRequired": true - }, - { - "paramName": "elp", - "paramLongName": "endingLogPeriod", - "paramDescription": "Ending log period", - "paramRequired": true - }, - { - "paramName": "npidd", - "paramLongName": "numberOfPiwikIdsToDownload", - "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload", - "paramRequired": true - }, - { - "paramName": "nsidd", - "paramLongName": "numberOfSiteIdsToDownload", - "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload", - "paramRequired": true - }, - { - "paramName": "lerd", - "paramLongName": "laReferenciaEmptyDirs", - "paramDescription": "Empty LaReferencia directories?", - "paramRequired": true - }, - { - "paramName": "plrl", - "paramLongName": "processLaReferenciaLogs", - "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", - "paramRequired": true - }, - { - "paramName": "dlrl", - "paramLongName": "downloadLaReferenciaLogs", - "paramDescription": "download La Referencia logs?", - "paramRequired": true - }, - { - "paramName": "icted", - "paramLongName": "irusCreateTablesEmptyDirs", - "paramDescription": "Irus section: Create tables and empty JSON directories?", - "paramRequired": true - }, - { - "paramName": "idr", - "paramLongName": "irusDownloadReports", - "paramDescription": "Irus section: Download reports?", - "paramRequired": true - }, - { - "paramName": "ipr", - "paramLongName": "irusProcessStats", - "paramDescription": "Irus section: Process stats?", - "paramRequired": true - }, - { - "paramName": "inod", - "paramLongName": "irusNumberOfOpendoarsToDownload", - "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", - "paramRequired": true - }, - { - "paramName": "icted", - "paramLongName": "sarcCreateTablesEmptyDirs", - "paramDescription": "Sarc section: Create tables and empty JSON directories?", - "paramRequired": true - }, - { - "paramName": "idr", - "paramLongName": "sarcDownloadReports", - "paramDescription": "Sarc section: Download reports?", - "paramRequired": true - }, - { - "paramName": "ipr", - "paramLongName": "sarcProcessStats", - "paramDescription": "Sarc section: Process stats?", - "paramRequired": true - }, - { - "paramName": "inod", - "paramLongName": "sarcNumberOfIssnToDownload", - "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", - "paramRequired": true - }, - - { - "paramName": "fs", - "paramLongName": "finalizeStats", - "paramDescription": "Create the usage_stats table?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", - "paramRequired": true - }, - { - "paramName": "nodt", - "paramLongName": "numberOfDownloadThreads", - "paramDescription": "Number of download threads", - "paramRequired": true - } -] diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml deleted file mode 100644 index b5c807378..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - jobTracker - ${jobTracker} - - - nameNode - ${nameNode} - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 - - - impalaJdbcUrl - jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; - - - oozie.wf.workflow.notification.url - {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status - - - oozie.use.system.libpath - true - - diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml deleted file mode 100644 index 8d62a85a9..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml +++ /dev/null @@ -1,90 +0,0 @@ - - - - hiveMetastoreUris - Hive server metastore URIs - - - hiveJdbcUrl - Hive server jdbc url - - - impalaJdbcUrl - Impala server jdbc url - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hiveMetastoreUris} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - eu.dnetlib.oa.graph.usagestats.export.ExecuteWorkflow - --matomoAuthToken${matomoAuthToken} - --matomoBaseURL${matomoBaseURL} - --repoLogPath${repoLogPath} - --portalLogPath${portalLogPath} - --portalMatomoID${portalMatomoID} - --irusUKBaseURL${irusUKBaseURL} - --irusUKReportPath${irusUKReportPath} - --sarcsReportPathArray${sarcsReportPathArray} - --sarcsReportPathNonArray${sarcsReportPathNonArray} - --lareferenciaLogPath${lareferenciaLogPath} - --lareferenciaBaseURL${lareferenciaBaseURL} - --lareferenciaAuthToken${lareferenciaAuthToken} - --dbHiveUrl${hiveJdbcUrl} - --dbImpalaUrl${impalaJdbcUrl} - --usageStatsDBSchema${usageStatsDBSchema} - --statsDBSchema${statsDBSchema} - --recreateDbAndTables${recreateDbAndTables} - --piwikEmptyDirs${piwikEmptyDirs} - --downloadPiwikLogs${downloadPiwikLogs} - --processPiwikLogs${processPiwikLogs} - --startingLogPeriod${startingLogPeriod} - --endingLogPeriod${endingLogPeriod} - --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} - --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} - --laReferenciaEmptyDirs${laReferenciaEmptyDirs} - --downloadLaReferenciaLogs${downloadLaReferenciaLogs} - --processLaReferenciaLogs${processLaReferenciaLogs} - --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} - --irusDownloadReports${irusDownloadReports} - --irusProcessStats${irusProcessStats} - --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload} - --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs} - --sarcDownloadReports${sarcDownloadReports} - --sarcProcessStats${sarcProcessStats} - --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} - --finalizeStats${finalizeStats} - --finalTablesVisibleToImpala${finalTablesVisibleToImpala} - --numberOfDownloadThreads${numberOfDownloadThreads} - - - - - - - - From 34d653de41e6264174c6bbecf26b1d356235c2ec Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 22 Jan 2021 14:16:33 +0100 Subject: [PATCH 07/16] [Cleaning] updated cleaning rule for DOIs --- .../java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 885d761c5..0cbf7219a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -15,7 +15,7 @@ import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { - public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; + public static final String DOI_PREFIX_REGEX = "^10\\."; public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; public static final int ORCID_LEN = 19; @@ -308,7 +308,7 @@ public class CleaningFunctions { // TODO add cleaning for more PID types as needed case "doi": - pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, "")); + pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10.")); break; } return pid; From 646dab7f680f641f9600b231920f1f624b7f2bcc Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 22 Jan 2021 18:24:34 +0100 Subject: [PATCH 08/16] trying to avoid NPEs --- .../src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index 0026ee9aa..845c4c982 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -323,13 +323,13 @@ public class Result extends OafEntity implements Serializable { if (a.size() == b.size()) { int msa = a .stream() - .filter(i -> i.getValue() != null) + .filter(i -> i != null && i.getValue() != null) .map(i -> i.getValue().length()) .max(Comparator.naturalOrder()) .orElse(0); int msb = b .stream() - .filter(i -> i.getValue() != null) + .filter(i -> i != null && i.getValue() != null) .map(i -> i.getValue().length()) .max(Comparator.naturalOrder()) .orElse(0); From 07a0ccfc96d50be6e75909da39730d3d55c7bbf9 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 25 Jan 2021 13:36:01 +0100 Subject: [PATCH 09/16] [Cleaning] trying to avoid NPEs --- .../dhp/oa/graph/clean/CleaningFunctions.java | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 0cbf7219a..42ce7f90b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -59,12 +59,16 @@ public class CleaningFunctions { } } if (Objects.nonNull(r.getAuthor())) { - r.getAuthor().forEach(a -> { - if (Objects.nonNull(a.getPid())) { - a.getPid().forEach(p -> { - fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES); - }); - } + r.getAuthor() + .stream() + .filter(Objects::nonNull) + .forEach(a -> { + if (Objects.nonNull(a.getPid())) { + a.getPid() + .stream() + .filter(Objects::nonNull) + .forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES)); + } }); } if (value instanceof Publication) { From 3465c8ccee3e2cb53792968264add0e3994fa04f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 25 Jan 2021 16:54:53 +0100 Subject: [PATCH 10/16] [Cleaning] trying to avoid NPEs --- .../dhp/oa/graph/clean/CleaningFunctions.java | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 42ce7f90b..1d0e35125 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -59,17 +59,19 @@ public class CleaningFunctions { } } if (Objects.nonNull(r.getAuthor())) { - r.getAuthor() - .stream() - .filter(Objects::nonNull) - .forEach(a -> { - if (Objects.nonNull(a.getPid())) { - a.getPid() - .stream() - .filter(Objects::nonNull) - .forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES)); - } - }); + r + .getAuthor() + .stream() + .filter(Objects::nonNull) + .forEach(a -> { + if (Objects.nonNull(a.getPid())) { + a + .getPid() + .stream() + .filter(Objects::nonNull) + .forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES)); + } + }); } if (value instanceof Publication) { @@ -208,6 +210,7 @@ public class CleaningFunctions { a .getPid() .stream() + .filter(Objects::nonNull) .filter(p -> Objects.nonNull(p.getQualifier())) .filter(p -> StringUtils.isNotBlank(p.getValue())) .map(p -> { From ded6ed8d7d328d2bec562d6438185a5b01289f8a Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Mon, 25 Jan 2021 17:57:51 +0100 Subject: [PATCH 11/16] no ',' author, if there are no author in ODF records --- .../dhp/oa/graph/raw/OdfToOafMapper.java | 46 +++---- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 9 ++ .../dhp/oa/graph/raw/textgrid-noauthor.xml | 117 ++++++++++++++++++ 3 files changed, 151 insertions(+), 21 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid-noauthor.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 6d2e28ba8..9dc0bb44b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -51,31 +51,35 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Node n = (Node) o; final Author author = new Author(); final String fullname = n.valueOf("./datacite:creatorName"); - author.setFullname(fullname); - - final PacePerson pp = new PacePerson(fullname, false); final String name = n.valueOf("./datacite:givenName"); - if (StringUtils.isBlank(name) & pp.isAccurate()) { - author.setName(pp.getNormalisedFirstName()); - } else { - author.setName(name); - } - final String surname = n.valueOf("./datacite:familyName"); - if (StringUtils.isBlank(surname) & pp.isAccurate()) { - author.setSurname(pp.getNormalisedSurname()); - } else { - author.setSurname(surname); - } + if(StringUtils.isNotBlank(fullname) || StringUtils.isNotBlank(name) || StringUtils.isNotBlank(surname)){ + author.setFullname(fullname); - if (StringUtils.isBlank(author.getFullname())) { - author.setFullname(String.format("%s, %s", author.getSurname(), author.getName())); - } + final PacePerson pp = new PacePerson(fullname, false); - author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); - author.setPid(preparePids(n, info)); - author.setRank(pos++); - res.add(author); + if (StringUtils.isBlank(name) & pp.isAccurate()) { + author.setName(pp.getNormalisedFirstName()); + } else { + author.setName(name); + } + + + if (StringUtils.isBlank(surname) & pp.isAccurate()) { + author.setSurname(pp.getNormalisedSurname()); + } else { + author.setSurname(surname); + } + + if (StringUtils.isBlank(author.getFullname())) { + author.setFullname(String.format("%s, %s", author.getSurname(), author.getName())); + } + + author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); + author.setPid(preparePids(n, info)); + author.setRank(pos++); + res.add(author); + } } return res; } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 2a62e25b2..ae203b3a7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -379,6 +379,15 @@ public class MappersTest { System.out.println(p.getPid().get(0).getValue()); } + @Test + void testTextGridNoAuthor() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid-noauthor.xml")); + final List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + } @Test void testBologna() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf-bologna.xml")); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid-noauthor.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid-noauthor.xml new file mode 100644 index 000000000..53256bed0 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid-noauthor.xml @@ -0,0 +1,117 @@ + + + + r3f52792889d::00002412cb25f2f3047712d00ab2c8eb + hdl:11858/00-1734-0000-0003-EE73-2 + 2020-12-16T10:04:03.148Z + r3f52792889d + textgrid:rn8z.0 + 2012-01-29T20:54:12Z + 2020-12-16T16:02:37.562Z + + + + hdl:11858/00-1734-0000-0003-EE73-2 + + + + + + + + Auf dem Trocknen + Detlev von Liliencron: Gute Nacht. Hinterlassene Gedichte, Berlin: Schuster & Loeffler, 1909. + + TextGrid + 2012 + + + tvitt@textgrid.de + + + Digitale Bibliothek + TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c + + + + 2012-01-29T20:54:12Z + 2012-01-29T20:54:12Z + 2012-01-29T20:54:12Z + + + + textgrid:rn8z.0 + http://hdl.handle.net/hdl:11858/00-1734-0000-0003-EE73-2 + + + hdl:11858/00-1734-0000-0003-EE72-4 + + + 527 Bytes + + + text/tg.edition+tg.aggregation+xml + + 0 + + Der annotierte Datenbestand der Digitalen Bibliothek inklusive + Metadaten sowie davon einzeln zugängliche Teile sind eine Abwandlung + des Datenbestandes von www.editura.de durch TextGrid und werden + unter der Lizenz Creative Commons Namensnennung 3.0 Deutschland + Lizenz (by-Nennung TextGrid) veröffentlicht. Die Lizenz bezieht sich + nicht auf die der Annotation zu Grunde liegenden allgemeinfreien + Texte (Siehe auch Punkt 2 der Lizenzbestimmungen). + + + + + + + + Berlin + + + + hdl:11858/00-1734-0000-0003-EE73-2 + 0021 + 0002 + 2012-01-29 + OPEN + http://creativecommons.org/licenses/by/3.0/de/legalcode + und + DE + + + + + + + https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai + textgrid:rn8z.0 + 2012-01-29T20:54:12Z + + + + + false + false + 0.9 + + + + + + From 505477f36fe2be46ca46fceba474d03c5e1c39b8 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Mon, 25 Jan 2021 18:02:49 +0100 Subject: [PATCH 12/16] format code --- .../main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java | 3 +-- .../src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 9dc0bb44b..683b37630 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -53,7 +53,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final String fullname = n.valueOf("./datacite:creatorName"); final String name = n.valueOf("./datacite:givenName"); final String surname = n.valueOf("./datacite:familyName"); - if(StringUtils.isNotBlank(fullname) || StringUtils.isNotBlank(name) || StringUtils.isNotBlank(surname)){ + if (StringUtils.isNotBlank(fullname) || StringUtils.isNotBlank(name) || StringUtils.isNotBlank(surname)) { author.setFullname(fullname); final PacePerson pp = new PacePerson(fullname, false); @@ -64,7 +64,6 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { author.setName(name); } - if (StringUtils.isBlank(surname) & pp.isAccurate()) { author.setSurname(pp.getNormalisedSurname()); } else { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index ae203b3a7..9827dcc99 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -388,6 +388,7 @@ public class MappersTest { System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println("***************"); } + @Test void testBologna() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf-bologna.xml")); From cd379eb5e324ade3ee6ff2448e2cb18870bca490 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 25 Jan 2021 18:11:49 +0100 Subject: [PATCH 13/16] [Cleaning] trying to avoid NPEs, this time by ruling out authors without a defined fullname --- .../eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 1d0e35125..7a12a92d6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -190,6 +190,15 @@ public class CleaningFunctions { } } if (Objects.nonNull(r.getAuthor())) { + r + .setAuthor( + r + .getAuthor() + .stream() + .filter(a -> Objects.nonNull(a)) + .filter(a -> StringUtils.isNotBlank(StringUtils.trim(a.getFullname()))) + .collect(Collectors.toList())); + boolean nullRank = r .getAuthor() .stream() From 2890511613aa80db927617e9564e039ca0c7d569 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 26 Jan 2021 09:41:44 +0100 Subject: [PATCH 14/16] [Cleaning] normalise missing Result.country --- .../dnetlib/dhp/oa/graph/clean/CleaningFunctions.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 7a12a92d6..0d69a448e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -110,6 +110,16 @@ public class CleaningFunctions { .setLanguage( qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); } + if (Objects.nonNull(r.getCountry())) { + r + .setCountry( + r + .getCountry() + .stream() + .filter(Objects::nonNull) + .filter(c -> StringUtils.isNotBlank(c.getClassid())) + .collect(Collectors.toList())); + } if (Objects.nonNull(r.getSubject())) { r .setSubject( From 885e0dd926a50212dbab9f583f1b72d162d94be5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 26 Jan 2021 09:48:53 +0100 Subject: [PATCH 15/16] [Cleaning] filter authors not providing word characters in the fullname --- .../java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 0d69a448e..db7d34ec9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -206,7 +206,8 @@ public class CleaningFunctions { .getAuthor() .stream() .filter(a -> Objects.nonNull(a)) - .filter(a -> StringUtils.isNotBlank(StringUtils.trim(a.getFullname()))) + .filter(a -> StringUtils.isNotBlank(a.getFullname())) + .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) .collect(Collectors.toList())); boolean nullRank = r From f1a852f27848f3ecd5852dc4260627381736ae38 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 26 Jan 2021 15:42:42 +0100 Subject: [PATCH 16/16] align usage-stats workflow poms with latest snapshot version --- .../dhp-usage-raw-data-update/pom.xml | 18 +---------------- dhp-workflows/dhp-usage-stats-build/pom.xml | 20 ++----------------- 2 files changed, 3 insertions(+), 35 deletions(-) diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index 44f28ff56..a78f92d41 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -1,25 +1,9 @@ - - - - - dhp-workflows eu.dnetlib.dhp - 1.1.7-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 dhp-usage-raw-data-update diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index 79fabb603..20d2f5b76 100644 --- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -1,29 +1,13 @@ - - - - - dhp-workflows eu.dnetlib.dhp - 1.1.7-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 dhp-usage-stats-build - + pl.project13.maven