diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml b/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml deleted file mode 100644 index a65c4514a..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/nb-configuration.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - JDK_1.8 - - diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml b/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml deleted file mode 100644 index 5593d4d87..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/pom.xml +++ /dev/null @@ -1,112 +0,0 @@ - - - - - - - - - dhp-workflows - eu.dnetlib.dhp - 1.1.7-SNAPSHOT - - 4.0.0 - dhp-usage-datasets-stats-update - - - - pl.project13.maven - git-commit-id-plugin - 2.1.15 - - - - revision - - - - - ${project.basedir}/../.git - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.6.1 - - 1.8 - 1.8 - - - - - - UTF-8 - UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 - - - - - org.apache.spark - spark-core_2.11 - 2.2.0 - - - org.apache.spark - spark-sql_2.11 - 2.4.5 - - - com.googlecode.json-simple - json-simple - 1.1.1 - - - org.json - json - 20180130 - jar - - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - com.mchange - c3p0 - 0.9.5.2 - - - c3p0 - c3p0 - 0.9.1.2 - jar - - - dhp-usage-datasets-stats-update - diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh b/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh deleted file mode 100755 index 9b4325508..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/runworkflow.sh +++ /dev/null @@ -1 +0,0 @@ -mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/datasetsusagestats \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java deleted file mode 100644 index 25b30e8ad..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ConnectDB.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.sql.Connection; -import java.sql.DriverManager; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.Properties; - -import org.apache.log4j.Logger; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -/** - * @author D. Pierrakos, S. Zoupanos - */ -import com.mchange.v2.c3p0.ComboPooledDataSource; - -public abstract class ConnectDB { - - public static Connection DB_HIVE_CONNECTION; - public static Connection DB_IMPALA_CONNECTION; - - private static String dbHiveUrl; - private static String dbImpalaUrl; - private static String datasetUsageStatsDBSchema; - private static String statsDBSchema; - private final static Logger logger = Logger.getLogger(ConnectDB.class); - private Statement stmt = null; - - static void init() throws ClassNotFoundException { - - dbHiveUrl = ExecuteWorkflow.dbHiveUrl; - dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; - datasetUsageStatsDBSchema = ExecuteWorkflow.datasetUsageStatsDBSchema; - statsDBSchema = ExecuteWorkflow.statsDBSchema; - - Class.forName("org.apache.hive.jdbc.HiveDriver"); - } - - public static Connection getHiveConnection() throws SQLException { - if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { - return DB_HIVE_CONNECTION; - } else { - DB_HIVE_CONNECTION = connectHive(); - - return DB_HIVE_CONNECTION; - } - } - - public static Connection getImpalaConnection() throws SQLException { - if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { - return DB_IMPALA_CONNECTION; - } else { - DB_IMPALA_CONNECTION = connectImpala(); - - return DB_IMPALA_CONNECTION; - } - } - - public static String getDataSetUsageStatsDBSchema() { - return ConnectDB.datasetUsageStatsDBSchema; - } - - public static String getStatsDBSchema() { - return ConnectDB.statsDBSchema; - } - - private static Connection connectHive() throws SQLException { - /* - * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = - * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbHiveUrl); - cpds.setUser("dimitris.pierrakos"); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); - - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); - - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - - logger.info("Opened database successfully"); - - return cpds.getConnection(); - - } - - private static Connection connectImpala() throws SQLException { - /* - * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = - * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbImpalaUrl); - cpds.setUser("dimitris.pierrakos"); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); - - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); - - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - - logger.info("Opened database successfully"); - return cpds.getConnection(); - - } -} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java deleted file mode 100644 index 88db1f819..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DatasetsStatsDB.java +++ /dev/null @@ -1,168 +0,0 @@ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.sql.Connection; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.*; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class DatasetsStatsDB { - - private String logPath; - private String logRepoPath; - private String logPortalPath; - - private Statement stmt = null; - - private static final Logger logger = LoggerFactory.getLogger(DatasetsStatsDB.class); - - private String CounterRobotsURL; - private ArrayList robotsList; - - public DatasetsStatsDB(String logRepoPath, String logPortalPath) throws Exception { - this.logRepoPath = logRepoPath; - this.logPortalPath = logPortalPath; - - } - - public void recreateDBAndTables() throws Exception { - this.createDatabase(); - this.createTables(); - } - -// public void reCreateLogDirs() throws IllegalArgumentException, IOException { -// FileSystem dfs = FileSystem.get(new Configuration()); -// -// logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); -// dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); -// -// logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); -// dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); -// -// logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); -// dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); -// -// logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); -// dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); -// } - public ArrayList getRobotsList() { - return robotsList; - } - - public void setRobotsList(ArrayList robotsList) { - this.robotsList = robotsList; - } - - public String getCounterRobotsURL() { - return CounterRobotsURL; - } - - public void setCounterRobotsURL(String CounterRobotsURL) { - this.CounterRobotsURL = CounterRobotsURL; - } - - private void createDatabase() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Dropping datasets DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); - } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); - } - - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating usagestats DB: " + ConnectDB.getDataSetUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); - - } catch (Exception e) { - logger.error("Failed to create database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } - } - - private void createTables() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - // Create Reports table - This table should exist - logger.info("Creating Reports Table"); - String sqlCreateTableDataciteReports = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports(reportid STRING, \n" - + " name STRING, \n" - + " source STRING,\n" - + " release STRING,\n" - + " createdby STRING,\n" - + " report_start_date STRING,\n" - + " report_end_date STRING)\n" - + " CLUSTERED BY (reportid)\n" - + " into 100 buckets stored as orc tblproperties('transactional'='true')"; - - stmt.executeUpdate(sqlCreateTableDataciteReports); - logger.info("Reports Table Created"); - - // Create Datasets Table - logger.info("Creating DataSets Table"); - String sqlCreateTableDataSets = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasets(ds_type STRING,\n" - + " ds_title STRING,\n" - + " yop STRING,\n" - + " uri STRING,\n" - + " platform STRING,\n" - + " data_type STRING,\n" - + " publisher STRING,\n" - + " publisher_id_type STRING,\n" - + " publisher_id_value STRING,\n" - + " ds_dates_type STRING,\n" - + " ds_pub_date STRING,\n" - + " ds_contributors STRING,\n" - // + " ds_contributor_value array ,\n" - + " reportid STRING)\n" - + " CLUSTERED BY (ds_type)\n" - + " into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableDataSets); - logger.info("DataSets Table Created"); - - // Create Datasets Performance Table - logger.info("Creating DataSetsPerformance Table"); - String sqlCreateTableDataSetsPerformance = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datasetsperformance(ds_type STRING,\n" - + " period_end STRING,\n" - + " period_from STRING,\n" - + " access_method STRING,\n" - + " metric_type STRING,\n" - + " count INT,\n" - + " country_counts STRING,\n" - + " reportid STRING)\n" - + " CLUSTERED BY (ds_type)\n" - + " into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableDataSetsPerformance); - logger.info("DataSetsPerformance Table Created"); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } - - private Connection getConnection() throws SQLException { - return ConnectDB.getHiveConnection(); - } -} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java deleted file mode 100644 index a73b299ec..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/DownloadReportsListFromDatacite.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Iterator; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.parser.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; - -/** - * @author dpie - */ -public class DownloadReportsListFromDatacite { - - private String dataciteBaseURL; - private String dataciteReportPath; - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - public DownloadReportsListFromDatacite(String dataciteBaseURL, String dataciteReportPath) - throws MalformedURLException, Exception { - - this.dataciteBaseURL = dataciteBaseURL; - this.dataciteReportPath = dataciteReportPath; - } - - public void downloadReportsList() throws ParseException { - StringBuilder responseStrBuilder = new StringBuilder(); - - Gson gson = new Gson(); - - try { - BufferedInputStream in = new BufferedInputStream(new URL(dataciteBaseURL).openStream()); - logger.info("Downloading from " + dataciteBaseURL); - - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String inputStr; - - while ((inputStr = streamReader.readLine()) != null) { - responseStrBuilder.append(inputStr); - } - } catch (IOException e) { - logger.info(e.getMessage()); - } - JsonObject jsonObject = gson.fromJson(responseStrBuilder.toString(), JsonObject.class); - JsonArray dataArray = jsonObject.getAsJsonArray("reports"); - ArrayList reportsList = new ArrayList(); - for (JsonElement element : dataArray) { - reportsList.add(element.getAsJsonObject().get("id").getAsString()); - } - - Iterator it = reportsList.iterator(); - while (it.hasNext()) { - String reportId = it.next().toString(); - String url = dataciteBaseURL + reportId; - - try { - BufferedInputStream in = new BufferedInputStream(new URL(url).openStream()); - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String inputStr; - StringBuilder responseStrBuilder2 = new StringBuilder(); - while ((inputStr = streamReader.readLine()) != null) { - responseStrBuilder2.append(inputStr); - } - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs - .create( - new Path(dataciteReportPath + "/" + reportId + ".json"), - true); - byte[] jsonObjectRawBytes = responseStrBuilder2.toString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - fin.close(); - - fin.close(); - } catch (IOException e) { - System.out.println(e); - } - } - } -} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java deleted file mode 100644 index b28578e4b..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ExecuteWorkflow.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import org.apache.commons.io.IOUtils; -import org.apache.log4j.BasicConfigurator; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class ExecuteWorkflow { - - static String dataciteBaseURL; - static String dataciteReportPath; - static String dbHiveUrl; - static String dbImpalaUrl; - static String datasetUsageStatsDBSchema; - static String statsDBSchema; - static boolean recreateDbAndTables; - static boolean datasetsEmptyDirs; - static boolean finalTablesVisibleToImpala; - - public static void main(String args[]) throws Exception { - - // Sending the logs to the console - BasicConfigurator.configure(); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - UsageStatsExporter.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json"))); - parser.parseArgument(args); - - // Setting up the initial parameters - dataciteBaseURL = parser.get("dataciteBaseURL"); - dataciteReportPath = parser.get("dataciteReportPath"); - dbHiveUrl = parser.get("dbHiveUrl"); - dbImpalaUrl = parser.get("dbImpalaUrl"); - datasetUsageStatsDBSchema = parser.get("datasetUsageStatsDBSchema"); - statsDBSchema = parser.get("statsDBSchema"); - - if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) - recreateDbAndTables = true; - else - recreateDbAndTables = false; - - if (parser.get("datasetsEmptyDirs").toLowerCase().equals("true")) - datasetsEmptyDirs = true; - else - datasetsEmptyDirs = false; - -// if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) -// finalTablesVisibleToImpala = true; -// else -// finalTablesVisibleToImpala = false; -// - UsageStatsExporter usagestatsExport = new UsageStatsExporter(); - usagestatsExport.export(); - } - -} diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java deleted file mode 100644 index ccb3eebd3..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java +++ /dev/null @@ -1,408 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.io.*; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.sql.Array; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.ArrayList; -import java.util.Base64; -import java.util.Iterator; -import java.util.Set; -import java.util.zip.GZIPInputStream; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; - -/** - * @author dpie - */ -public class ReadReportsListFromDatacite { - - private String dataciteReportPath; - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - public ReadReportsListFromDatacite(String dataciteReportPath) throws MalformedURLException, Exception { - - this.dataciteReportPath = dataciteReportPath; - } - - public void readReports() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - File folder = new File(dataciteReportPath); - ArrayList jsonFiles = listHdfsDir(dataciteReportPath); - for (String jsonFile : jsonFiles) { - logger.info("Reading report file " + jsonFile); - this.createTmpReportsTable(jsonFile); - - String sqlSelectReportID = "SELECT get_json_object(json, '$.report.id') FROM " - + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlSelectReportID); - ResultSet rstmpReportID = stmt.getResultSet(); - - String reportID = null; - while (rstmpReportID.next()) { - reportID = rstmpReportID.getString(1); - } - - logger.info("Checking report with id " + reportID); - String sqlCheckIfReportExists = "SELECT source FROM " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".datacitereports where reportid=?"; - PreparedStatement stGetReportID = ConnectDB.getHiveConnection().prepareStatement(sqlCheckIfReportExists); - stGetReportID.setString(1, reportID); - - ResultSet rsCheckIfReportExist = stGetReportID.executeQuery(); - - if (rsCheckIfReportExist.next()) { - logger.info("Report found with ID " + reportID); - dropTmpReportsTable(); - } else { - String sqlInsertReport = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datacitereports " - + "SELECT\n" - + " get_json_object(json, '$.report.id') AS reportid,\n" - + " get_json_object(json, '$.report.report-header.report-name') AS name,\n" - + " get_json_object(json, '$.report.report-header.report-id') AS source,\n" - + " get_json_object(json, '$.report.report-header.release') AS release,\n" - + " get_json_object(json, '$.report.report-header.created-by\') AS createdby,\n" - + " get_json_object(json, '$.report.report-header.reporting-period.begin-date') AS fromdate,\n" - + " get_json_object(json, '$.report.report-header.reporting-period.end-date') AS todate \n" - + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlInsertReport); - - logger.info("Report added"); - - logger.info("Adding datasets"); - String sqlSelecteDatasetsArray = "SELECT get_json_object(json, '$.report.report-datasets') FROM " - + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlSelecteDatasetsArray); - ResultSet rstmpReportDatasets = stmt.getResultSet(); - - if (rstmpReportDatasets.next() && rstmpReportDatasets.getString(1).indexOf(',') > 0) { - String[] listDatasets = rstmpReportDatasets.getString(1).split(","); - logger.info("Datasets found " + listDatasets.length); - - for (int i = 0; i < listDatasets.length; i++) { - - String sqlInsertDataSets = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datasets " - + "SELECT\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-id[0].value') AS ds_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-title') AS ds_title,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].yop') AS yop,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].uri') AS uri,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].platform') AS platform,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].data-type') AS data_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i + "].publisher') AS publisher,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].publisher-id.type[0]') AS publisher_id_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].publisher-id.value[0]') AS publisher_id_value,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-dates.type[0]') AS ds_dates_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-dates.value[0]') AS ds_dates_value,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-contributors') AS ds_contributors,\n" - + " get_json_object(json, '$.report.id') AS reportid \n" - + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlInsertDataSets); - - logger.info("Dataset added " + i); - - logger.info("Adding Dataset Performance"); - String sqlSelecteDatasetsPerformance = "SELECT get_json_object(json, '$.report.report-datasets[" - + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlSelecteDatasetsPerformance); - ResultSet rstmpReportDatasetsPerformance = stmt.getResultSet(); - if (rstmpReportDatasetsPerformance.next() - && rstmpReportDatasetsPerformance.getString(1).indexOf(',') > 0) { - String[] listDatasetsPerformance = rstmpReportDatasetsPerformance.getString(1).split(","); - logger.info("Datasets Performance found " + listDatasetsPerformance.length); - for (int j = 0; j < listDatasetsPerformance.length; j++) { - String sqlSelecteDatasetsPerformanceInstance = "SELECT get_json_object(json, '$.report.report-datasets[" - + i + "].performance') FROM " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".tmpjson"; - stmt.execute(sqlSelecteDatasetsPerformanceInstance); - ResultSet rstmpReportDatasetsPerformanceInstance = stmt.getResultSet(); - if (rstmpReportDatasetsPerformanceInstance.next() - && rstmpReportDatasetsPerformanceInstance.getString(1).indexOf(',') > 0) { - String[] listDatasetsPerformanceInstance = rstmpReportDatasetsPerformanceInstance - .getString(1) - .split(","); - logger.info("Datasets Performance found " + listDatasetsPerformanceInstance.length); - for (int k = 0; k < listDatasetsPerformanceInstance.length; k++) { - String sqlInsertDataSetsPerformance = "INSERT INTO " - + ConnectDB.getDataSetUsageStatsDBSchema() + " .datasetsperformance " - + "SELECT\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].dataset-id[0].value') AS ds_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].period.end-date') AS period_end,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].period.begin-date') AS period_from,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].instance[" + k - + "].access-method') AS access_method,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].instance[" + k - + "].metric-type') AS metric_type,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].instance[" + k + "].count') AS count,\n" - + " get_json_object(json, '$.report.report-datasets[" + i - + "].performance[" + j + "].instance[" + k - + "].country-counts') AS country_counts,\n" - + " get_json_object(json, '$.report.id') AS reportid \n" - + "FROM " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlInsertDataSetsPerformance); - } - } - } - } - logger.info("DatasetPerformance added for dataset" + i); - } - } - logger.info("Adding gzip performance"); - String sqlSelecteReportSubsets = "SELECT get_json_object(json, '$.report.report-subsets.gzip[0]') FROM " - + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(sqlSelecteReportSubsets); - ResultSet rstmpReportSubsets = stmt.getResultSet(); - if (rstmpReportSubsets.next()) { - String unCompressedReport = uncompressString(rstmpReportSubsets.getString(1)); - this.readCompressedReport(unCompressedReport, reportID); - } - } - } - this.dropTmpReportsTable(); - } - - public void readCompressedReport(String report, String reportId) throws Exception { - Gson gson = new Gson(); - JsonObject jsonObject = gson.fromJson(report, JsonObject.class); - - JsonArray jsonReportDatasets; - if (jsonObject.getAsJsonArray("report_datasets") != null) { - jsonReportDatasets = jsonObject.getAsJsonArray("report_datasets"); - } else { - jsonReportDatasets = jsonObject.getAsJsonArray("report-datasets"); - } - - for (JsonElement datasetElement : jsonReportDatasets) { - // JsonElement dataset_title = datasetElement.getAsJsonObject().get("dataset-title"); - String dataset_title = datasetElement.getAsJsonObject().get("dataset-title").getAsString(); - String yop = datasetElement.getAsJsonObject().get("yop").getAsString(); - String uri = datasetElement.getAsJsonObject().get("uri").getAsString(); - String platform = datasetElement.getAsJsonObject().get("platform").getAsString(); - String data_type = datasetElement.getAsJsonObject().get("data-type").getAsString(); - String publisher = datasetElement.getAsJsonObject().get("publisher").getAsString(); - - JsonArray publisher_id = datasetElement.getAsJsonObject().getAsJsonArray("publisher-id"); - String publisher_id_type = ""; - String publisher_id_value = ""; - for (JsonElement publisher_id_Element : publisher_id) { - publisher_id_type = publisher_id_Element.getAsJsonObject().get("type").getAsString(); - publisher_id_value = publisher_id_Element.getAsJsonObject().get("value").getAsString(); - } - JsonArray dataset_days = datasetElement.getAsJsonObject().getAsJsonArray("dataset-dates"); - String ds_dates_type = ""; - String ds_dates_value = ""; - for (JsonElement datasetDaysElement : dataset_days) { - ds_dates_type = datasetDaysElement.getAsJsonObject().get("type").getAsString(); - ds_dates_value = datasetDaysElement.getAsJsonObject().get("value").getAsString(); - } - - JsonArray datasetContributors = null; - String ds_contributor_type = ""; - String[] ds_contributor_values = null; - Array ds_contributor_valuesArr = null; - - if (datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors") != null) { - datasetContributors = datasetElement.getAsJsonObject().getAsJsonArray("dataset-contributors"); - - JsonArray datasetid = datasetElement.getAsJsonObject().getAsJsonArray("dataset-id"); - String doi = ""; - for (JsonElement datasetIDElement : datasetid) -//System.out.println(datasetIDElement.getAsJsonObject().get("value").getAsString()); - { - doi = datasetIDElement.getAsJsonObject().get("value").getAsString(); - } - - String sqlInsertDataset = "INSERT INTO " + ConnectDB.getDataSetUsageStatsDBSchema() - + " .datasets(ds_type," - + "ds_title,yop,uri,platform,data_type,publisher,publisher_id_type,publisher_id_value," - + "ds_dates_type, ds_dates_value, ds_contributors,reportid) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?) "; - - PreparedStatement pstmtDataset = ConnectDB.DB_HIVE_CONNECTION.prepareStatement(sqlInsertDataset); - - pstmtDataset.setString(1, doi); - pstmtDataset.setString(2, dataset_title); - pstmtDataset.setString(3, yop); - pstmtDataset.setString(4, uri); - pstmtDataset.setString(5, platform); - pstmtDataset.setString(6, data_type); - pstmtDataset.setString(7, publisher); - pstmtDataset.setString(8, publisher_id_type); - pstmtDataset.setString(9, publisher_id_value); - pstmtDataset.setString(10, ds_dates_type); - pstmtDataset.setString(11, ds_dates_value); - pstmtDataset.setString(13, datasetContributors.getAsString()); - pstmtDataset.setString(14, reportId); - - pstmtDataset.execute(); - logger.info("Dataset from compressed report addded " + doi); - /* - * JsonArray performance = datasetElement.getAsJsonObject().getAsJsonArray("performance"); for - * (JsonElement performanceElement : performance) { JsonObject period = - * performanceElement.getAsJsonObject().getAsJsonObject("period"); String end_date = - * period.getAsJsonObject().get("end-date").getAsString(); String begin_date = - * period.getAsJsonObject().get("begin-date").getAsString(); JsonArray instance = - * performanceElement.getAsJsonObject().getAsJsonArray("instance"); for (JsonElement instanceElement : - * instance) { int count = instanceElement.getAsJsonObject().get("count").getAsInt(); JsonObject - * country_counts = instanceElement.getAsJsonObject().getAsJsonObject("country-counts"); Set - * keys = country_counts.keySet(); String[] country = new String[country_counts.size()]; String[] - * country_counts_val = new String[country_counts.size()]; Iterator it2 = keys.iterator(); int j = 0; - * while (it2.hasNext()) { country[j] = it2.next().toString(); country_counts_val[j] = - * country_counts.get(country[j]).getAsString(); } Array countryArr = conn.createArrayOf("text", - * country); Array countrycountsArr = conn.createArrayOf("text", country_counts_val); String metrictype - * = instanceElement.getAsJsonObject().get("metric-type").getAsString(); String accessMethod = - * instanceElement.getAsJsonObject().get("access-method").getAsString(); String - * sqlInsertDatasetPerformance = - * "INSERT INTO datasetperformance(ds_type,period_end,period_from,access_method,metric_type,count,country,country_count, reportid) VALUES(?,?,?,?,?,?,?,?,?)" - * ; PreparedStatement pstmtDatasetPerformance = conn.prepareStatement(sqlInsertDatasetPerformance); - * //System.out.println(begin_date + " " + end_date + " " + doi + " " + metrictype + " " + count); - * pstmtDatasetPerformance.setString(1, doi); pstmtDatasetPerformance.setString(2, end_date); - * pstmtDatasetPerformance.setString(3, begin_date); pstmtDatasetPerformance.setString(4, accessMethod); - * pstmtDatasetPerformance.setString(5, metrictype); pstmtDatasetPerformance.setInt(6, count); - * pstmtDatasetPerformance.setArray(7, countryArr); pstmtDatasetPerformance.setArray(8, - * countrycountsArr); pstmtDatasetPerformance.setString(9, reportId); pstmtDatasetPerformance.execute(); - * } } - */ - } - } - - } - - private ArrayList listHdfsDir(String dir) throws Exception { - - FileSystem hdfs = FileSystem.get(new Configuration()); - RemoteIterator Files; - ArrayList fileNames = new ArrayList<>(); - - try { - Path exportPath = new Path(hdfs.getUri() + dir); - Files = hdfs.listFiles(exportPath, false); - while (Files.hasNext()) { - String fileName = Files.next().getPath().toString(); - fileNames.add(fileName); - } - - hdfs.close(); - } catch (Exception e) { - logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + dir)); - throw new Exception("HDFS file path with exported data does not exist : " + dir, e); - } - - return fileNames; - } - - private String readHDFSFile(String filename) throws Exception { - String result; - try { - - FileSystem fs = FileSystem.get(new Configuration()); - // log.info("reading file : " + filename); - - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); - - StringBuilder sb = new StringBuilder(); - String line = br.readLine(); - - while (line != null) { - sb.append(line); - // sb.append(line); - line = br.readLine(); - } - // result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); - result = sb.toString().trim(); - // fs.close(); - } catch (Exception e) { - throw new Exception(e); - } - - return result; - } - - public static String uncompressString(String zippedBase64Str) - throws IOException { - String result = null; - - // In my solr project, I use org.apache.solr.common.util.Base64. - // byte[] bytes = - // org.apache.solr.common.util.Base64.base64ToByteArray(zippedBase64Str); - byte[] bytes = Base64.getDecoder().decode(zippedBase64Str); - GZIPInputStream zi = null; - try { - zi = new GZIPInputStream(new ByteArrayInputStream(bytes)); - result = IOUtils.toString(zi); - } finally { - IOUtils.closeQuietly(zi); - } - return result; - } - - private void createTmpReportsTable(String jsonFile) throws SQLException { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - dropTmpReportsTable(); - String createTmpTable = "CREATE TEMPORARY TABLE " + ConnectDB.getDataSetUsageStatsDBSchema() - + ".tmpjson (json STRING)"; - stmt.executeUpdate(createTmpTable); - logger.info("Tmp Table Created"); - - String insertJsonReport = "LOAD DATA INPATH '" + jsonFile + "' INTO TABLE " - + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - stmt.execute(insertJsonReport); - logger.info("JSON Report File inserted to tmpjson Table"); - } - - private void dropTmpReportsTable() throws SQLException { - logger.info("Dropping tmpjson Table"); - String dropTmpTable = "DROP TABLE IF EXISTS " + ConnectDB.getDataSetUsageStatsDBSchema() + ".tmpjson"; - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - stmt.executeUpdate(dropTmpTable); - logger.info("Dropped tmpjson Table"); - - } - -} - -/* - * PreparedStatement prepStatem = conn. - * prepareStatement("insert into usageStats (source, entityID,sourceItemType,entityType, counter,action,timestamp_month,referrer) values (?,?,?,?,?,?,?,?)" - * ); - */ diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java deleted file mode 100644 index 7b07fbc25..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/UsageStatsExporter.java +++ /dev/null @@ -1,111 +0,0 @@ - -package eu.dnetlib.oa.graph.datasetsusagestats.export; - -import java.io.IOException; -import java.sql.SQLException; -import java.sql.Statement; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Main class for downloading and processing Usage statistics - * - * @author D. Pierrakos, S. Zoupanos - */ -public class UsageStatsExporter { - - private Statement stmt = null; - - public UsageStatsExporter() { - - } - - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - private void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting Log directory: " + ExecuteWorkflow.dataciteReportPath); - dfs.delete(new Path(ExecuteWorkflow.dataciteReportPath), true); - - logger.info("Creating Log directory: " + ExecuteWorkflow.dataciteReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.dataciteReportPath)); - - } - - public void export() throws Exception { - - logger.info("Initialising DB properties"); - ConnectDB.init(); - ConnectDB.getHiveConnection(); - - if (ExecuteWorkflow.recreateDbAndTables) { - DatasetsStatsDB datasetsDB = new DatasetsStatsDB("", ""); - datasetsDB.recreateDBAndTables(); - } - logger.info("Initializing the download logs module"); - DownloadReportsListFromDatacite drfd = new DownloadReportsListFromDatacite(ExecuteWorkflow.dataciteBaseURL, - ExecuteWorkflow.dataciteReportPath); - - if (ExecuteWorkflow.datasetsEmptyDirs) { - logger.info("Downloading Reports List From Datacite"); - drfd.downloadReportsList(); - logger.info("Reports List has been downloaded"); - } - - ReadReportsListFromDatacite readReportsListFromDatacite = new ReadReportsListFromDatacite( - ExecuteWorkflow.dataciteReportPath); - logger.info("Store Reports To DB"); - readReportsListFromDatacite.readReports(); - logger.info("Reports Stored To DB"); - } - -// runImpalaQuery(); - /* - * PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); - * logger.info("Re-creating database and tables"); logger.info("Initializing the download logs module"); - * PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); - * if (ExecuteWorkflow.piwikEmptyDirs) { logger.info("Recreating Piwik log directories"); - * piwikstatsdb.reCreateLogDirs(); } // Downloading piwik logs (also managing directory creation) if - * (ExecuteWorkflow.downloadPiwikLogs) { logger.info("Downloading piwik logs"); piwd .GetOpenAIRELogs( - * ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); } - * logger.info("Downloaded piwik logs"); // Create DB tables, insert/update statistics String cRobotsUrl = - * "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; - * piwikstatsdb.setCounterRobotsURL(cRobotsUrl); if (ExecuteWorkflow.processPiwikLogs) { - * logger.info("Processing logs"); piwikstatsdb.processLogs(); } logger.info("Creating LaReferencia tables"); - * LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, - * ExecuteWorkflow.lareferenciaAuthToken); if (ExecuteWorkflow.laReferenciaEmptyDirs) { - * logger.info("Recreating LaReferencia log directories"); lrf.reCreateLogDirs(); } if - * (ExecuteWorkflow.downloadLaReferenciaLogs) { logger.info("Downloading LaReferencia logs"); - * lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); logger.info("Downloaded LaReferencia logs"); } - * LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); if - * (ExecuteWorkflow.processLaReferenciaLogs) { logger.info("Processing LaReferencia logs"); lastats.processLogs(); - * logger.info("LaReferencia logs done"); } IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); if - * (ExecuteWorkflow.irusCreateTablesEmptyDirs) { logger.info("Creating Irus Stats tables"); - * irusstats.createTables(); logger.info("Created Irus Stats tables"); logger.info("Re-create log dirs"); - * irusstats.reCreateLogDirs(); logger.info("Re-created log dirs"); } if (ExecuteWorkflow.irusDownloadReports) { - * irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); } if (ExecuteWorkflow.irusProcessStats) { - * irusstats.processIrusStats(); logger.info("Irus done"); } SarcStats sarcStats = new SarcStats(); if - * (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { sarcStats.reCreateLogDirs(); } if - * (ExecuteWorkflow.sarcDownloadReports) { sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, - * ExecuteWorkflow.sarcsReportPathNonArray); } if (ExecuteWorkflow.sarcProcessStats) { - * sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - * sarcStats.finalizeSarcStats(); } logger.info("Sarc done"); // finalize usagestats if - * (ExecuteWorkflow.finalizeStats) { piwikstatsdb.finalizeStats(); logger.info("Finalized stats"); } // Make the - * tables available to Impala if (ExecuteWorkflow.finalTablesVisibleToImpala) { - * logger.info("Making tables visible to Impala"); invalidateMetadata(); } logger.info("End"); - */ -} -/* - * private void invalidateMetadata() throws SQLException { Statement stmt = null; stmt = - * ConnectDB.getImpalaConnection().createStatement(); String sql = "INVALIDATE METADATA " + - * ConnectDB.getDataSetUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " - * + ConnectDB.getDataSetUsageStatsDBSchema() + ".views_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + - * ConnectDB.getDataSetUsageStatsDBSchema() + ".usage_stats"; stmt.executeUpdate(sql); sql = "INVALIDATE METADATA " + - * ConnectDB.getDataSetUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(sql); stmt.close(); - * ConnectDB.getHiveConnection().close(); } - */ diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json deleted file mode 100644 index f8d51a882..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/export/datasets_usagestats_parameters.json +++ /dev/null @@ -1,56 +0,0 @@ -[ - { - "paramName": "dbu", - "paramLongName": "dataciteBaseURL", - "paramDescription": "URL of Datacite Reports Endpoint", - "paramRequired": true - }, - { - "paramName": "drp", - "paramLongName": "dataciteReportPath", - "paramDescription": "Path for Datacite Reports", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dusdbs", - "paramLongName": "datasetUsageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "datasetsEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the dataset_usage_stats, visible to Impala", - "paramRequired": true - } -] diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml deleted file mode 100644 index b5c807378..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/config-default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - jobTracker - ${jobTracker} - - - nameNode - ${nameNode} - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 - - - impalaJdbcUrl - jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; - - - oozie.wf.workflow.notification.url - {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status - - - oozie.use.system.libpath - true - - diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml deleted file mode 100644 index 3a81e497d..000000000 --- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/datasetsusagestats/oozie_app/workflow.xml +++ /dev/null @@ -1,70 +0,0 @@ - - - - hiveMetastoreUris - Hive server metastore URIs - - - hiveJdbcUrl - Hive server jdbc url - - - impalaJdbcUrl - Impala server jdbc url - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hiveMetastoreUris} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - eu.dnetlib.oa.graph.datasetsusagestats.export.ExecuteWorkflow - --dataciteBaseURL - ${dataciteBaseURL} - --dataciteReportPath - ${dataciteReportPath} - --dbHiveUrl - ${hiveJdbcUrl} - --dbImpalaUrl - ${impalaJdbcUrl} - --datasetUsageStatsDBSchema - ${datasetUsageStatsDBSchema} - --statsDBSchema - ${statsDBSchema} - --recreateDbAndTables - ${recreateDbAndTables} - --datasetsEmptyDirs - ${datasetsEmptyDirs} - --finalTablesVisibleToImpala - ${finalTablesVisibleToImpala} - - - - - - - - diff --git a/dhp-workflows/dhp-usage-stats-update/pom.xml b/dhp-workflows/dhp-usage-stats-update/pom.xml deleted file mode 100644 index b56257ee5..000000000 --- a/dhp-workflows/dhp-usage-stats-update/pom.xml +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - - - - dhp-workflows - eu.dnetlib.dhp - 1.1.7-SNAPSHOT - - 4.0.0 - dhp-usage-stats-update - - - UTF-8 - UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 - - - - - org.apache.spark - spark-core_2.11 - 2.2.0 - - - org.apache.spark - spark-sql_2.11 - 2.4.5 - - - com.googlecode.json-simple - json-simple - 1.1.1 - - - org.json - json - 20180130 - jar - - - org.apache.hive - hive-jdbc - ${cdh.hive.version} - - - org.apache.hadoop - hadoop-common - ${cdh.hadoop.version} - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - c3p0 - c3p0 - 0.9.1.2 - jar - - - diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java deleted file mode 100644 index 29dd5648b..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ConnectDB.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.sql.Connection; -import java.sql.DriverManager; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.Properties; - -import org.apache.log4j.Logger; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -/** - * @author D. Pierrakos, S. Zoupanos - */ -import com.mchange.v2.c3p0.ComboPooledDataSource; - -public abstract class ConnectDB { - - public static Connection DB_HIVE_CONNECTION; - public static Connection DB_IMPALA_CONNECTION; - - private static String dbHiveUrl; - private static String dbImpalaUrl; - private static String usageStatsDBSchema; - private static String statsDBSchema; - private final static Logger log = Logger.getLogger(ConnectDB.class); - - static void init() throws ClassNotFoundException { - - dbHiveUrl = ExecuteWorkflow.dbHiveUrl; - dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; - usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; - statsDBSchema = ExecuteWorkflow.statsDBSchema; - - Class.forName("org.apache.hive.jdbc.HiveDriver"); - } - - public static Connection getHiveConnection() throws SQLException { - if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { - return DB_HIVE_CONNECTION; - } else { - DB_HIVE_CONNECTION = connectHive(); - - return DB_HIVE_CONNECTION; - } - } - - public static Connection getImpalaConnection() throws SQLException { - if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { - return DB_IMPALA_CONNECTION; - } else { - DB_IMPALA_CONNECTION = connectImpala(); - - return DB_IMPALA_CONNECTION; - } - } - - public static String getUsageStatsDBSchema() { - return ConnectDB.usageStatsDBSchema; - } - - public static String getStatsDBSchema() { - return ConnectDB.statsDBSchema; - } - - private static Connection connectHive() throws SQLException { - /* - * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = - * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbHiveUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); - - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); - - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - return cpds.getConnection(); - - } - - private static Connection connectImpala() throws SQLException { - /* - * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = - * connection.createStatement(); log.debug("Opened database successfully"); return connection; - */ - ComboPooledDataSource cpds = new ComboPooledDataSource(); - cpds.setJdbcUrl(dbImpalaUrl); - cpds.setAcquireIncrement(1); - cpds.setMaxPoolSize(100); - cpds.setMinPoolSize(1); - cpds.setInitialPoolSize(1); - cpds.setMaxIdleTime(300); - cpds.setMaxConnectionAge(36000); - - cpds.setAcquireRetryAttempts(5); - cpds.setAcquireRetryDelay(2000); - cpds.setBreakAfterAcquireFailure(false); - - cpds.setCheckoutTimeout(0); - cpds.setPreferredTestQuery("SELECT 1"); - cpds.setIdleConnectionTestPeriod(60); - - return cpds.getConnection(); - - } - -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java deleted file mode 100644 index 50b951cbc..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java +++ /dev/null @@ -1,197 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.text.SimpleDateFormat; -import java.util.Calendar; -import java.util.Date; - -import org.apache.commons.io.IOUtils; -import org.apache.log4j.BasicConfigurator; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class ExecuteWorkflow { - - static String matomoAuthToken; - static String matomoBaseURL; - static String repoLogPath; - static String portalLogPath; - static String portalMatomoID; - static String irusUKBaseURL; - static String irusUKReportPath; - static String sarcsReportPathArray; - static String sarcsReportPathNonArray; - static String lareferenciaLogPath; - static String lareferenciaBaseURL; - static String lareferenciaAuthToken; - static String dbHiveUrl; - static String dbImpalaUrl; - static String usageStatsDBSchema; - static String statsDBSchema; - static boolean recreateDbAndTables; - - static boolean piwikEmptyDirs; - static boolean downloadPiwikLogs; - static boolean processPiwikLogs; - - static Calendar startingLogPeriod; - static Calendar endingLogPeriod; - static int numberOfPiwikIdsToDownload; - static int numberOfSiteIdsToDownload; - - static boolean laReferenciaEmptyDirs; - static boolean downloadLaReferenciaLogs; - static boolean processLaReferenciaLogs; - - static boolean irusCreateTablesEmptyDirs; - static boolean irusDownloadReports; - static boolean irusProcessStats; - static int irusNumberOfOpendoarsToDownload; - - static boolean sarcCreateTablesEmptyDirs; - static boolean sarcDownloadReports; - static boolean sarcProcessStats; - static int sarcNumberOfIssnToDownload; - - static boolean finalizeStats; - static boolean finalTablesVisibleToImpala; - - static int numberOfDownloadThreads; - - public static void main(String args[]) throws Exception { - - // Sending the logs to the console - BasicConfigurator.configure(); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - UsageStatsExporter.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json"))); - parser.parseArgument(args); - - // Setting up the initial parameters - matomoAuthToken = parser.get("matomoAuthToken"); - matomoBaseURL = parser.get("matomoBaseURL"); - repoLogPath = parser.get("repoLogPath"); - portalLogPath = parser.get("portalLogPath"); - portalMatomoID = parser.get("portalMatomoID"); - irusUKBaseURL = parser.get("irusUKBaseURL"); - irusUKReportPath = parser.get("irusUKReportPath"); - sarcsReportPathArray = parser.get("sarcsReportPathArray"); - sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); - lareferenciaLogPath = parser.get("lareferenciaLogPath"); - lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); - lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); - - dbHiveUrl = parser.get("dbHiveUrl"); - dbImpalaUrl = parser.get("dbImpalaUrl"); - usageStatsDBSchema = parser.get("usageStatsDBSchema"); - statsDBSchema = parser.get("statsDBSchema"); - - if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) - recreateDbAndTables = true; - else - recreateDbAndTables = false; - - if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) - piwikEmptyDirs = true; - else - piwikEmptyDirs = false; - - if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) - downloadPiwikLogs = true; - else - downloadPiwikLogs = false; - - if (parser.get("processPiwikLogs").toLowerCase().equals("true")) - processPiwikLogs = true; - else - processPiwikLogs = false; - - String startingLogPeriodStr = parser.get("startingLogPeriod"); - Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); - startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); - - String endingLogPeriodStr = parser.get("endingLogPeriod"); - Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); - endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); - - numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); - numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); - - if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) - laReferenciaEmptyDirs = true; - else - laReferenciaEmptyDirs = false; - - if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) - downloadLaReferenciaLogs = true; - else - downloadLaReferenciaLogs = false; - - if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) - processLaReferenciaLogs = true; - else - processLaReferenciaLogs = false; - - if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) - irusCreateTablesEmptyDirs = true; - else - irusCreateTablesEmptyDirs = false; - if (parser.get("irusDownloadReports").toLowerCase().equals("true")) - irusDownloadReports = true; - else - irusDownloadReports = false; - if (parser.get("irusProcessStats").toLowerCase().equals("true")) - irusProcessStats = true; - else - irusProcessStats = false; - irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); - - if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) - sarcCreateTablesEmptyDirs = true; - else - sarcCreateTablesEmptyDirs = false; - if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) - sarcDownloadReports = true; - else - sarcDownloadReports = false; - if (parser.get("sarcProcessStats").toLowerCase().equals("true")) - sarcProcessStats = true; - else - sarcProcessStats = false; - sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); - - if (parser.get("finalizeStats").toLowerCase().equals("true")) - finalizeStats = true; - else - finalizeStats = false; - if (parser.get("finalTablesVisibleToImpala").toLowerCase().equals("true")) - finalTablesVisibleToImpala = true; - else - finalTablesVisibleToImpala = false; - - numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); - - UsageStatsExporter usagestatsExport = new UsageStatsExporter(); - usagestatsExport.export(); - } - - private static Calendar startingLogPeriodStr(Date date) { - - Calendar calendar = Calendar.getInstance(); - calendar.setTime(date); - return calendar; - - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java deleted file mode 100644 index 011c90532..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java +++ /dev/null @@ -1,419 +0,0 @@ -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.URL; -import java.net.URLConnection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class IrusStats { - - private String irusUKURL; - - private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); - - public IrusStats(String irusUKURL) throws Exception { - this.irusUKURL = irusUKURL; - // The following may not be needed - It will be created when JSON tables are created -// createTmpTables(); - } - - public void reCreateLogDirs() throws Exception { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); - - logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); - } - - public void createTables() throws Exception { - try { - logger.info("Creating sushilog"); - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog(source STRING, " - + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " - + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableSushiLog); - logger.info("Created sushilog"); - - // To see how to apply to the ignore duplicate rules and indexes -// stmt.executeUpdate(sqlCreateTableSushiLog); -// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO sushilog " -// + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," -// + "sushilog.rid, sushilog.date " -// + "FROM sushilog " -// + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlcreateRuleSushiLog); -// String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; -// stmt.executeUpdate(createSushiIndex); - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } - -// // The following may not be needed - It will be created when JSON tables are created -// private void createTmpTables() throws Exception { -// try { -// -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; -// stmt.executeUpdate(sqlCreateTableSushiLog); -// -// // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;"); -// // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;"; -// // stmt.executeUpdate(sqlCopyPublicSushiLog); -// String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO sushilogtmp " -// + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository," -// + "sushilogtmp.rid, sushilogtmp.date " -// + "FROM sushilogtmp " -// + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlcreateRuleSushiLog); -// -// stmt.close(); -// ConnectDB.getConnection().close(); -// log.info("Sushi Tmp Tables Created"); -// } catch (Exception e) { -// log.error("Failed to create tables: " + e); -// throw new Exception("Failed to create tables: " + e.toString(), e); -// } -// } - public void processIrusStats() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); - - logger.info("Dropping sushilogtmp_json table"); - String dropSushilogtmpJson = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sushilogtmp_json"; - stmt.executeUpdate(dropSushilogtmpJson); - logger.info("Dropped sushilogtmp_json table"); - - logger.info("Creating irus_sushilogtmp_json table"); - String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" - + " `ItemIdentifier` ARRAY<\n" - + " struct<\n" - + " Type: STRING,\n" - + " Value: STRING\n" - + " >\n" - + " >,\n" - + " `ItemPerformance` ARRAY<\n" - + " struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >\n" - + " >\n" - + ")\n" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(createSushilogtmpJson); - logger.info("Created irus_sushilogtmp_json table"); - - logger.info("Dropping irus_sushilogtmp table"); - String dropSushilogtmp = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(dropSushilogtmp); - logger.info("Dropped irus_sushilogtmp table"); - - logger.info("Creating irus_sushilogtmp table"); - String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp(source STRING, repository STRING, " - + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + "tblproperties('transactional'='true')"; - stmt.executeUpdate(createSushilogtmp); - logger.info("Created irus_sushilogtmp table"); - - logger.info("Inserting to irus_sushilogtmp table"); - String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " - + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " - + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " - + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " - + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " - + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " - + "WHERE `ItemIdent`.`Type`= 'OAI'"; - stmt.executeUpdate(insertSushilogtmp); - logger.info("Inserted to irus_sushilogtmp table"); - - logger.info("Creating downloads_stats table"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmt.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats table"); - - logger.info("Inserting into downloads_stats"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_oids ro " - + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'"; - stmt.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats"); - - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository_id` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); - - logger.info("Inserting to sushilog table"); - String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " - + ConnectDB.getUsageStatsDBSchema() - + ".irus_sushilogtmp"; - stmt.executeUpdate(insertToShushilog); - logger.info("Inserted to sushilog table"); - - ConnectDB.getHiveConnection().close(); - } - - public void getIrusRRReport(String irusUKReportPath) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); - - String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) - + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; - - logger.info("(getIrusRRReport) Getting report: " + reportUrl); - - String text = getJson(reportUrl, "", ""); - - List opendoarsToVisit = new ArrayList(); - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - int i = 0; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); - for (Object identifier : itemIdentifier) { - JSONObject opendoar = (JSONObject) identifier; - if (opendoar.get("Type").toString().equals("OpenDOAR")) { - i++; - opendoarsToVisit.add(opendoar.get("Value").toString()); - break; - } - } - // break; - } - - logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); - - if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 - && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); - } - - logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); - - for (String opendoar : opendoarsToVisit) { - logger.info("Now working on openDoar: " + opendoar); - this.getIrusIRReport(opendoar, irusUKReportPath); - } - - logger.info("(getIrusRRReport) Finished with report: " + reportUrl); - } - - private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { - - logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); - - ConnectDB.getHiveConnection().setAutoCommit(false); - - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, "opendoar____::" + opendoar); - ResultSet rs_date = st.executeQuery(); - Date dateMax = null; - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - int batch_size = 0; - - if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); - } else { - while (start.before(end)) { - logger.info("date: " + simpleDateFormat.format(start.getTime())); - String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) - + "&RepositoryIdentifier=opendoar%3A" + opendoar - + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; - start.add(Calendar.MONTH, 1); - - logger.info("Downloading file: " + reportUrl); - String text = getJson(reportUrl, "", ""); - if (text == null) { - continue; - } - - FileSystem fs = FileSystem.get(new Configuration()); - String filePath = irusUKReportPath + "/" + "IrusIRReport_" - + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePath); - FSDataOutputStream fin = fs.create(new Path(filePath), true); - - JSONParser parser = new JSONParser(); - JSONObject jsonObject = (JSONObject) parser.parse(text); - jsonObject = (JSONObject) jsonObject.get("ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Report"); - jsonObject = (JSONObject) jsonObject.get("Customer"); - JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); - if (jsonArray == null) { - continue; - } - String oai = ""; - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - fin.write(jsonObjectRow.toJSONString().getBytes()); - fin.writeChar('\n'); - } - - fin.close(); - } - - } - //ConnectDB.getHiveConnection().close(); - - logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); - } - - private String getJson(String url) throws Exception { - try { - System.out.println("===> Connecting to: " + url); - URL website = new URL(url); - System.out.println("Connection url -----> " + url); - URLConnection connection = website.openConnection(); - - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); -// response.append("\n"); - } - } - - System.out.println("response ====> " + response.toString()); - - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } - - private String getJson(String url, String username, String password) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL", e); - return null; - } - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java deleted file mode 100644 index 7a61b1f46..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java +++ /dev/null @@ -1,265 +0,0 @@ -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.URL; -import java.net.URLConnection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class LaReferenciaDownloadLogs { - - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; - - /* - * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; - private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; - - private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); - - public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; - this.createTables(); -// this.createTmpTables(); - } - - public void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); - - logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); - } - - private void createTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating LaReferencia tables"); - String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " - + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " - + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " - + "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableLareferenciaLog); - logger.info("Created LaReferencia tables"); -// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialog " -// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," -// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " -// + "FROM lareferencialog " -// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; -// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); -// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Lareferencia Tables Created"); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - // System.exit(0); - } - } - -// private void createTmpTables() throws Exception { -// -// try { -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; -// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialogtmp " -// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," -// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " -// + "FROM lareferencialogtmp " -// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); -// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); -// -// stmt.close(); -// log.info("Lareferencia Tmp Tables Created"); -// -// } catch (Exception e) { -// log.error("Failed to create tmptables: " + e); -// throw new Exception("Failed to create tmp tables: " + e.toString(), e); -// // System.exit(0); -// } -// } - private String getPiwikLogUrl() { - return piwikUrl + "/"; - } - - private String getJson(String url) throws Exception { - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); -// response.append("\n"); - } - } - - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + e); - throw new Exception("Failed to get URL: " + e.toString(), e); - } - } - - public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { - - String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; - String content = ""; - - List siteIdsToVisit = new ArrayList(); - - // Getting all the siteIds in a list for logging reasons & limiting the list - // to the max number of siteIds - content = getJson(baseApiUrl); - JSONParser parser = new JSONParser(); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); - } - logger.info("Found the following siteIds for download: " + siteIdsToVisit); - - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 - && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } - - logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); - - for (int siteId : siteIdsToVisit) { - logger.info("Now working on LaReferencia MatomoId: " + siteId); - this.GetLaReFerenciaLogs(repoLogsPath, siteId); - } - } - - public void GetLaReFerenciaLogs(String repoLogsPath, - int laReferencialMatomoID) throws Exception { - - logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); - - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); - - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".lareferencialog WHERE matomoid=?"); - st.setInt(1, laReferencialMatomoID); - Date dateMax = null; - - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - Date date = currDay.getTime(); - if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + laReferencialMatomoID); - } else { - logger - .info( - "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " - + sdf.format(date)); - - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - outFolder = repoLogsPath; - - FileSystem fs = FileSystem.get(new Configuration()); - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), - true); - - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - int i = 0; - - JSONParser parser = new JSONParser(); - do { - String apiUrl = baseApiUrl; - - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } - - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } - - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - fin.write(jsonObjectRaw.toJSONString().getBytes()); - fin.writeChar('\n'); - } - - logger - .info( - "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID - + " and for " - + sdf.format(date)); - i++; - } while (true); - fin.close(); - } - } - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java deleted file mode 100644 index 747b5ce0e..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java +++ /dev/null @@ -1,436 +0,0 @@ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.URLDecoder; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.SQLException; -import java.sql.Statement; -import java.sql.Timestamp; -import java.text.SimpleDateFormat; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class LaReferenciaStats { - - private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class); - - private String logRepoPath; - - private Statement stmt = null; - - private String CounterRobotsURL; - private ArrayList robotsList; - - public LaReferenciaStats(String logRepoPath) throws Exception { - this.logRepoPath = logRepoPath; - this.createTables(); -// this.createTmpTables(); - } - - /* - * private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) { - * log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } } - */ - private void createTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating LaReferencia tables"); - String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + - "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + - "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + - "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTableLareferenciaLog); - logger.info("Created LaReferencia tables"); -// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialog " -// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," -// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " -// + "FROM lareferencialog " -// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; -// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); -// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Lareferencia Tables Created"); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - // System.exit(0); - } - } - -// private void createTmpTables() throws Exception { -// -// try { -// Statement stmt = ConnectDB.getConnection().createStatement(); -// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; -// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " -// + " ON INSERT TO lareferencialogtmp " -// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," -// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " -// + "FROM lareferencialogtmp " -// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; -// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); -// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); -// -// stmt.close(); -// log.info("Lareferencia Tmp Tables Created"); -// -// } catch (Exception e) { -// log.error("Failed to create tmptables: " + e); -// throw new Exception("Failed to create tmp tables: " + e.toString(), e); -// // System.exit(0); -// } -// } - - public void processLogs() throws Exception { - try { - logger.info("Processing LaReferencia repository logs"); - processlaReferenciaLog(); - logger.info("LaReferencia repository logs process done"); - - logger.info("LaReferencia removing double clicks"); - removeDoubleClicks(); - logger.info("LaReferencia removed double clicks"); - - logger.info("LaReferencia creating viewsStats"); - viewsStats(); - logger.info("LaReferencia created viewsStats"); - logger.info("LaReferencia creating downloadsStats"); - downloadsStats(); - logger.info("LaReferencia created downloadsStats"); - logger.info("LaReferencia updating Production Tables"); - updateProdTables(); - logger.info("LaReferencia updated Production Tables"); - - } catch (Exception e) { - logger.error("Failed to process logs: " + e); - throw new Exception("Failed to process logs: " + e.toString(), e); - } - } - - public void processlaReferenciaLog() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); - - logger.info("Dropping lareferencialogtmp_json table"); - String drop_lareferencialogtmp_json = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".lareferencialogtmp_json"; - stmt.executeUpdate(drop_lareferencialogtmp_json); - logger.info("Dropped lareferencialogtmp_json table"); - - logger.info("Creating lareferencialogtmp_json"); - String create_lareferencialogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".lareferencialogtmp_json(\n" + - " `idSite` STRING,\n" + - " `idVisit` STRING,\n" + - " `country` STRING,\n" + - " `referrerName` STRING,\n" + - " `browser` STRING,\n" + - " `repItem` STRING,\n" + - " `actionDetails` ARRAY<\n" + - " struct<\n" + - " timestamp: STRING,\n" + - " type: STRING,\n" + - " url: STRING,\n" + - " `customVariables`: struct<\n" + - " `1`: struct<\n" + - " `customVariablePageValue1`: STRING\n" + - " >,\n" + - " `2`: struct<\n" + - " `customVariablePageValue2`: STRING\n" + - " >\n" + - " >\n" + - " >\n" + - " >" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.lareferenciaLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_lareferencialogtmp_json); - logger.info("Created lareferencialogtmp_json"); - - logger.info("Dropping lareferencialogtmp table"); - String drop_lareferencialogtmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".lareferencialogtmp"; - stmt.executeUpdate(drop_lareferencialogtmp); - logger.info("Dropped lareferencialogtmp table"); - - logger.info("Creating lareferencialogtmp"); - String create_lareferencialogtmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp(matomoid INT, " + - "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + - "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + - "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(create_lareferencialogtmp); - logger.info("Created lareferencialogtmp"); - - logger.info("Inserting into lareferencialogtmp"); - String insert_lareferencialogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp " + - "SELECT DISTINCT cast(idSite as INT) as matomoid, CONCAT('opendoar____::', " + - "actiondetail.customVariables.`2`.customVariablePageValue2) as source, idVisit as id_Visit, country, " + - "actiondetail.type as action, actiondetail.url as url, " + - "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + - "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + - "referrerName as referrer_name, browser as agent " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json " + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; - stmt.executeUpdate(insert_lareferencialogtmp); - logger.info("Inserted into lareferencialogtmp"); - - stmt.close(); - } - - public void removeDoubleClicks() throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Cleaning download double clicks"); - // clean download double clicks - String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" + - "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " + - ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " + - "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " + - "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " + - "AND p1.timestamp listHdfsDir(String dir) throws Exception { - FileSystem hdfs = FileSystem.get(new Configuration()); - RemoteIterator Files; - ArrayList fileNames = new ArrayList<>(); - - try { - Path exportPath = new Path(hdfs.getUri() + dir); - Files = hdfs.listFiles(exportPath, false); - while (Files.hasNext()) { - String fileName = Files.next().getPath().toString(); - // log.info("Found hdfs file " + fileName); - fileNames.add(fileName); - } - // hdfs.close(); - } catch (Exception e) { - logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logRepoPath)); - throw new Exception("HDFS file path with exported data does not exist : " + logRepoPath, e); - } - - return fileNames; - } - - private String readHDFSFile(String filename) throws Exception { - String result; - try { - - FileSystem fs = FileSystem.get(new Configuration()); - // log.info("reading file : " + filename); - - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); - - StringBuilder sb = new StringBuilder(); - String line = br.readLine(); - - while (line != null) { - if (!line.equals("[]")) { - sb.append(line); - } - // sb.append(line); - line = br.readLine(); - } - result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); - if (result.equals("")) { - result = "[]"; - } - - // fs.close(); - } catch (Exception e) { - logger.error(e.getMessage()); - throw new Exception(e); - } - - return result; - } - -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java deleted file mode 100644 index 8f7fffa9f..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java +++ /dev/null @@ -1,325 +0,0 @@ -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.Authenticator; -import java.net.URL; -import java.net.URLConnection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class PiwikDownloadLogs { - - private final String piwikUrl; - private Date startDate; - private final String tokenAuth; - - /* - * The Piwik's API method - */ - private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; - private final String format = "&format=json"; - - private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); - - public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { - this.piwikUrl = piwikUrl; - this.tokenAuth = tokenAuth; - - } - - private String getPiwikLogUrl() { - return "https://" + piwikUrl + "/"; - } - - private String getJson(String url) throws Exception { - try { - logger.debug("Connecting to download the JSON: " + url); - URL website = new URL(url); - URLConnection connection = website.openConnection(); - - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - } - } - return response.toString(); - } catch (Exception e) { - logger.error("Failed to get URL: " + url + " Exception: " + e); - throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); - } - } - - class WorkerThread implements Runnable { - - private Calendar currDay; - private int siteId; - private String repoLogsPath; - private String portalLogPath; - private String portalMatomoID; - - public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws IOException { - this.currDay = (Calendar) currDay.clone(); - this.siteId = new Integer(siteId); - this.repoLogsPath = new String(repoLogsPath); - this.portalLogPath = new String(portalLogPath); - this.portalMatomoID = new String(portalMatomoID); - } - - public void run() { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - System.out - .println( - Thread.currentThread().getName() + " (Start) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId - + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath - + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - try { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - System.out - .println( - Thread.currentThread().getName() + " (End) Thread for " - + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId - + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath - + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); - } - - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } - - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - - int i = 0; - - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); - - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; - - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } - - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } - - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - writtenBytes += jsonObjectRawBytes.length + 1; - } - - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); - - i++; - } while (true); - - fs.close(); - } - } - - public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { - - Statement statement = ConnectDB.getHiveConnection().createStatement(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - - ResultSet rs = statement - .executeQuery( - "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() - + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); - - // Getting all the piwikids in a list for logging reasons & limitting the list - // to the max number of piwikids - List piwikIdToVisit = new ArrayList(); - while (rs.next()) - piwikIdToVisit.add(rs.getInt(1)); - logger.info("Found the following piwikIds for download: " + piwikIdToVisit); - - if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 - && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { - logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); - } - - logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); - - // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); - for (int siteId : piwikIdToVisit) { - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("Starting period for log download: " + sdf.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("Ending period for log download: " + sdf.format(end.getTime())); - - logger.info("Now working on piwikId: " + siteId); - - PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION - .prepareStatement( - "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() - + ".piwiklog WHERE source=?"); - st.setInt(1, siteId); - Date dateMax = null; - ResultSet rs_date = st.executeQuery(); - while (rs_date.next()) { - logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); - - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - - for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { - // logger.info("Date used " + currDay.toString()); - // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - // executor.execute(worker);// calling execute method of ExecutorService - logger.info("Date used " + currDay.getTime().toString()); - - if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); - } else { - GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); - } - - } - } - // executor.shutdown(); - // while (!executor.isTerminated()) { - // } - // System.out.println("Finished all threads"); - } - - public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, - String portalMatomoID) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - - Date date = currDay.getTime(); - logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); - - String period = "&period=day&date=" + sdf.format(date); - String outFolder = ""; - if (siteId == Integer.parseInt(portalMatomoID)) { - outFolder = portalLogPath; - } else { - outFolder = repoLogsPath; - } - - String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format - + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; - String content = ""; - - int i = 0; - - JSONParser parser = new JSONParser(); - StringBuffer totalContent = new StringBuffer(); - FileSystem fs = FileSystem.get(new Configuration()); - - do { - int writtenBytes = 0; - String apiUrl = baseApiUrl; - - if (i > 0) { - apiUrl += "&filter_offset=" + (i * 1000); - } - - content = getJson(apiUrl); - if (content.length() == 0 || content.equals("[]")) { - break; - } - - FSDataOutputStream fin = fs - .create( - new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"), - true); - JSONArray jsonArray = (JSONArray) parser.parse(content); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRaw = (JSONObject) aJsonArray; - byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); - fin.write(jsonObjectRawBytes); - fin.writeChar('\n'); - - writtenBytes += jsonObjectRawBytes.length + 1; - } - - fin.close(); - System.out - .println( - Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes - + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i - + ".json"); - - i++; - } while (true); - - fs.close(); - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java deleted file mode 100644 index 6d5bdfac0..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java +++ /dev/null @@ -1,1262 +0,0 @@ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -import java.net.URLDecoder; -import java.sql.Connection; -import java.sql.SQLException; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.*; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class PiwikStatsDB { - - private String logPath; - private String logRepoPath; - private String logPortalPath; - - private Statement stmt = null; - - private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); - - private String CounterRobotsURL; - private ArrayList robotsList; - - public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception { - this.logRepoPath = logRepoPath; - this.logPortalPath = logPortalPath; - - } - - public void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); - dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); - - logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); - dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); - - logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); - - logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); - } - - public void recreateDBAndTables() throws Exception { - this.createDatabase(); - this.createTables(); - // The piwiklog table is not needed since it is built - // on top of JSON files - this.createTmpTables(); - } - - public ArrayList getRobotsList() { - return robotsList; - } - - public void setRobotsList(ArrayList robotsList) { - this.robotsList = robotsList; - } - - public String getCounterRobotsURL() { - return CounterRobotsURL; - } - - public void setCounterRobotsURL(String CounterRobotsURL) { - this.CounterRobotsURL = CounterRobotsURL; - } - - private void createDatabase() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); - String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; - stmt.executeUpdate(dropDatabase); - } catch (Exception e) { - logger.error("Failed to drop database: " + e); - throw new Exception("Failed to drop database: " + e.toString(), e); - } - - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); - String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); - stmt.executeUpdate(createDatabase); - - } catch (Exception e) { - logger.error("Failed to create database: " + e); - throw new Exception("Failed to create database: " + e.toString(), e); - } - } - - private void createTables() throws Exception { - try { - stmt = ConnectDB.getHiveConnection().createStatement(); - - // Create Piwiklog table - This table should exist - String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) " - + "into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTablePiwikLog); - - ///////////////////////////////////////// - // Rule for duplicate inserts @ piwiklog - ///////////////////////////////////////// - - String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTablePortalLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ process_portal_log - ////////////////////////////////////////////////// - - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } - - private void createTmpTables() throws Exception { - try { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " - + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " - + "stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTmpTablePiwikLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ piwiklogtmp - ////////////////////////////////////////////////// - - ////////////////////////////////////////////////// - // Copy from public.piwiklog to piwiklog - ////////////////////////////////////////////////// - // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;"; - // stmt.executeUpdate(sqlCopyPublicPiwiklog); - - String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, " - + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " - + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(sqlCreateTmpTablePortalLog); - - ////////////////////////////////////////////////// - // Rule for duplicate inserts @ process_portal_log_tmp - ////////////////////////////////////////////////// - - stmt.close(); - - } catch (Exception e) { - logger.error("Failed to create tmptables: " + e); - throw new Exception("Failed to create tmp tables: " + e.toString(), e); - // System.exit(0); - } - } - - public void processLogs() throws Exception { - try { - ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); - this.robotsList = counterRobots.getRobotsPatterns(); - - logger.info("Processing repository logs"); - processRepositoryLog(); - logger.info("Repository logs process done"); - - logger.info("Removing double clicks"); - removeDoubleClicks(); - logger.info("Removing double clicks done"); - - logger.info("Cleaning oai"); - cleanOAI(); - logger.info("Cleaning oai done"); - - logger.info("Processing portal logs"); - processPortalLog(); - logger.info("Portal logs process done"); - - logger.info("Processing portal usagestats"); - portalStats(); - logger.info("Portal usagestats process done"); - - logger.info("ViewsStats processing starts"); - viewsStats(); - logger.info("ViewsStats processing ends"); - - logger.info("DownloadsStats processing starts"); - downloadsStats(); - logger.info("DownloadsStats processing starts"); - - logger.info("Updating Production Tables"); - updateProdTables(); - logger.info("Updated Production Tables"); - - } catch (Exception e) { - logger.error("Failed to process logs: " + e); - throw new Exception("Failed to process logs: " + e.toString(), e); - } - } - - public void processRepositoryLog() throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); - - logger.info("Dropping piwiklogtmp_json table"); - String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp_json"; - stmt.executeUpdate(drop_piwiklogtmp_json); - logger.info("Dropped piwiklogtmp_json table"); - - logger.info("Creating piwiklogtmp_json"); - String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp_json(\n" + - " `idSite` STRING,\n" + - " `idVisit` STRING,\n" + - " `country` STRING,\n" + - " `referrerName` STRING,\n" + - " `browser` STRING,\n" + - " `actionDetails` ARRAY<\n" + - " struct<\n" + - " type: STRING,\n" + - " url: STRING,\n" + - " `customVariables`: struct<\n" + - " `1`: struct<\n" + - " `customVariablePageValue1`: STRING\n" + - " >\n" + - " >,\n" + - " timestamp: String\n" + - " >\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_piwiklogtmp_json); - logger.info("Created piwiklogtmp_json"); - - logger.info("Dropping piwiklogtmp table"); - String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp"; - stmt.executeUpdate(drop_piwiklogtmp); - logger.info("Dropped piwiklogtmp"); - - logger.info("Creating piwiklogtmp"); - String create_piwiklogtmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + - ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + - "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(create_piwiklogtmp); - logger.info("Created piwiklogtmp"); - - logger.info("Inserting into piwiklogtmp"); - String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + - "actiondetail.type as action, actiondetail.url as url, " + - "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + - "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + - "referrerName as referrer_name, browser as agent\n" + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; - stmt.executeUpdate(insert_piwiklogtmp); - logger.info("Inserted into piwiklogtmp"); - - stmt.close(); - } - - public void removeDoubleClicks() throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Cleaning download double clicks"); - // clean download double clicks - String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "WHERE EXISTS (\n" + - "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + - ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + - "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" - + - "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + - "AND p1.timestamp\n" + - " >\n" + - ")\n" + - "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + - "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_process_portal_log_tmp_json); - logger.info("Created process_portal_log_tmp_json"); - - logger.info("Droping process_portal_log_tmp table"); - String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + - ConnectDB.getUsageStatsDBSchema() + - ".process_portal_log_tmp"; - stmt.executeUpdate(drop_process_portal_log_tmp); - logger.info("Dropped process_portal_log_tmp"); - - logger.info("Creating process_portal_log_tmp"); - String create_process_portal_log_tmp = "CREATE TABLE " + - ConnectDB.getUsageStatsDBSchema() + - ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + - "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + - "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; - stmt.executeUpdate(create_process_portal_log_tmp); - logger.info("Created process_portal_log_tmp"); - - logger.info("Inserting into process_portal_log_tmp"); - String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".process_portal_log_tmp " + - "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " - + - "actiondetail.url as url, " + - "CASE\n" + - " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + - " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + - " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " - + - " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + - " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + - " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + - " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + - " ELSE '' " + - "END AS entity_id, " + - "CASE " + - " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + - " WHEN (actiondetail.url like '%articleId=%') THEN 'result' " + - " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + - " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + - " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + - " ELSE '' " + - "END AS source_item_type, " + - "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + - "browser as agent " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + - "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; - stmt.executeUpdate(insert_process_portal_log_tmp); - logger.info("Inserted into process_portal_log_tmp"); - - stmt.close(); - } - - public void portalStats() throws SQLException { - Connection con = ConnectDB.getHiveConnection(); - Statement stmt = con.createStatement(); - con.setAutoCommit(false); - -// Original queries where of the style -// -// SELECT DISTINCT source, id_visit, country, action, url, roid.oid, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp2, -// openaire_prod_stats_20200821.result_oids roid -// WHERE entity_id IS NOT null AND entity_id=roid.oid AND roid.oid IS NOT null -// -// The following query is an example of how queries should be -// -// -// INSERT INTO usagestats_20200907.piwiklogtmp -// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp -// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id -// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL); -// -// We should consider if we would like the queries to be as the following -// -// INSERT INTO usagestats_20200907.piwiklogtmp -// SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent -// FROM usagestats_20200907.process_portal_log_tmp -// WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id != '' AND process_portal_log_tmp.entity_id -// IN (SELECT roid.oid FROM openaire_prod_stats_20200821.result_oids roid WHERE roid.oid IS NOT NULL AND -// roid.oid != ''); - - logger.info("PortalStats - Step 1"); - String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() - + ".result_oids roid WHERE roid.id IS NOT NULL)"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("PortalStats - Step 2"); - stmt = con.createStatement(); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() - + ".datasource_oids roid WHERE roid.id IS NOT NULL)"; - stmt.executeUpdate(sql); - stmt.close(); - - /* - * logger.info("PortalStats - Step 3"); stmt = con.createStatement(); sql = "INSERT INTO " + - * ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - * "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'organization', `timestamp`, referrer_name, agent " - * + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - * "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - * "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + - * ".organization_oids roid WHERE roid.id IS NOT NULL)"; // stmt.executeUpdate(sql); stmt.close(); - */ - logger.info("PortalStats - Step 3"); - stmt = con.createStatement(); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " - + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + - "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + - "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() - + ".project_oids roid WHERE roid.id IS NOT NULL)"; - stmt.executeUpdate(sql); - stmt.close(); - - con.close(); - } - - private void cleanOAI() throws Exception { - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Cleaning oai - Step 1"); - stmt = ConnectDB.getHiveConnection().createStatement(); - String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + - "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 2"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + - "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 3"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + - "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 4"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + - "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 5"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + - "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 6"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + - "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 7"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + - "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 8"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + - "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 9"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + - "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 10"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + - "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 11"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + - "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 12"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + - "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 13"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + - "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 14"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + - "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 15"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + - "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 16"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + - "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 17"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + - "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 18"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + - "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 19"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + - "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 20"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + - "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 21"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + - "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 22"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + - "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 23"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + - "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 24"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + - "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 25"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + - "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 26"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + - "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 27"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + - "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 28"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + - "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Step 29"); - stmt = ConnectDB.getHiveConnection().createStatement(); - sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + - "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + - "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; - stmt.executeUpdate(sql); - stmt.close(); - - logger.info("Cleaning oai - Done, closing connection"); - ConnectDB.getHiveConnection().close(); - } - - private String processPortalURL(String url) { - - if (url.indexOf("explore.openaire.eu") > 0) { - try { - url = URLDecoder.decode(url, "UTF-8"); - } catch (Exception e) { - logger.info("Error when decoding the following URL: " + url); - } - if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) { - url = "datasource|" - + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59); - } else if (url.indexOf("datasource=") > 0 - && url.substring(url.indexOf("datasource=") + 11).length() >= 46) { - url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57); - } else if (url.indexOf("datasourceFilter=") > 0 - && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) { - url = "datasource|" - + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63); - } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) { - url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56); - } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) { - url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56); - } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 - && !url.contains("oai:dnet:corda")) { - url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56); - } else if (url.indexOf("organizationId=") > 0 - && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) { - url = "organization|" - + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61); - } else { - url = ""; - } - } else { - url = ""; - } - - return url; - } - - private void updateProdTables() throws SQLException { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Inserting data to piwiklog"); - String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to views_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to downloads_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Inserting data to pageviews_stats"); - sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + - "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; - stmt.executeUpdate(sql); - - logger.info("Creating usage_stats table"); - String createUsageStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats " + - "AS SELECT coalesce(ds.source, vs.source) as source, " + - "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + - "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + - "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + - "coalesce(ds.openaire, 0) as openaire_downloads, " + - "coalesce(vs.openaire, 0) as openaire_views " + - "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + - ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + - "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; - stmt.executeUpdate(createUsageStats); - logger.info("Created usage_stats table"); - - - /* - * logger.info("Dropping table views_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table downloads_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table pageviews_stats_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(sql); - * logger.info("Dropping table process_portal_log_tmp"); sql = "DROP TABLE IF EXISTS " + - * ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; stmt.executeUpdate(sql); - */ - stmt.close(); - ConnectDB.getHiveConnection().close(); - - } - - private ArrayList listHdfsDir(String dir) throws Exception { - - FileSystem hdfs = FileSystem.get(new Configuration()); - RemoteIterator Files; - ArrayList fileNames = new ArrayList<>(); - - try { - Path exportPath = new Path(hdfs.getUri() + dir); - Files = hdfs.listFiles(exportPath, false); - while (Files.hasNext()) { - String fileName = Files.next().getPath().toString(); - fileNames.add(fileName); - } - - hdfs.close(); - } catch (Exception e) { - logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath)); - throw new Exception("HDFS file path with exported data does not exist : " + logPath, e); - } - - return fileNames; - } - - private String readHDFSFile(String filename) throws Exception { - String result; - try { - - FileSystem fs = FileSystem.get(new Configuration()); - // log.info("reading file : " + filename); - - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); - - StringBuilder sb = new StringBuilder(); - String line = br.readLine(); - - while (line != null) { - if (!line.equals("[]")) { - sb.append(line); - } - // sb.append(line); - line = br.readLine(); - } - result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); - if (result.equals("")) { - result = "[]"; - } - - // fs.close(); - } catch (Exception e) { - logger.error(e.getMessage()); - throw new Exception(e); - } - - return result; - } - - private Connection getConnection() throws SQLException { - return ConnectDB.getHiveConnection(); - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java deleted file mode 100644 index 1708a1c64..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ReadCounterRobotsList.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - -package eu.dnetlib.oa.graph.usagestats.export; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -/** - * @author D. Pierrakos, S. Zoupanos - */ -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.net.URL; -import java.nio.charset.Charset; -import java.util.ArrayList; - -import org.json.JSONException; -import org.json.simple.JSONArray; -import org.json.simple.parser.JSONParser; -import org.json.simple.parser.ParseException; - -public class ReadCounterRobotsList { - - private ArrayList robotsPatterns = new ArrayList(); - private String COUNTER_ROBOTS_URL; - - public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException { - COUNTER_ROBOTS_URL = url; - robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL); - } - - private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException { - InputStream is = new URL(url).openStream(); - JSONParser parser = new JSONParser(); - BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1"))); - JSONArray jsonArray = (JSONArray) parser.parse(reader); - for (Object aJsonArray : jsonArray) { - org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray; - robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\")); - } - return robotsPatterns; - } - - public ArrayList getRobotsPatterns() { - return robotsPatterns; - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java deleted file mode 100644 index 06e350c9e..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java +++ /dev/null @@ -1,575 +0,0 @@ -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.*; -// import java.io.BufferedReader; -// import java.io.InputStreamReader; -import java.net.URL; -import java.net.URLConnection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.Statement; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.json.simple.parser.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author D. Pierrakos, S. Zoupanos - */ -public class SarcStats { - - private Statement stmtHive = null; - private Statement stmtImpala = null; - - private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); - - public SarcStats() throws Exception { -// createTables(); - } - - private void createTables() throws Exception { - try { - - stmtHive = ConnectDB.getHiveConnection().createStatement(); - String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; - stmtHive.executeUpdate(sqlCreateTableSushiLog); - - // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; - // stmt.executeUpdate(sqlCopyPublicSushiLog); - String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " - + " ON INSERT TO sushilog " - + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," - + "sushilog.rid, sushilog.date " - + "FROM sushilog " - + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; - stmtHive.executeUpdate(sqlcreateRuleSushiLog); - String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; - stmtHive.executeUpdate(createSushiIndex); - - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - logger.info("Sushi Tables Created"); - } catch (Exception e) { - logger.error("Failed to create tables: " + e); - throw new Exception("Failed to create tables: " + e.toString(), e); - } - } - - public void reCreateLogDirs() throws IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); - dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); - - logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); - dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); - - logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); - dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); - - logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); - dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); - } - - public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Adding JSON Serde jar"); - stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); - logger.info("Added JSON Serde jar"); - - logger.info("Dropping sarc_sushilogtmp_json_array table"); - String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); - logger.info("Dropped sarc_sushilogtmp_json_array table"); - - logger.info("Creating sarc_sushilogtmp_json_array table"); - String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" - + " `ItemIdentifier` ARRAY<\n" - + " struct<\n" - + " `Type`: STRING,\n" - + " `Value`: STRING\n" - + " >\n" - + " >,\n" - + " `ItemPerformance` struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >\n" - + ")" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + sarcsReportPathArray + "/'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_array); - logger.info("Created sarc_sushilogtmp_json_array table"); - - logger.info("Dropping sarc_sushilogtmp_json_non_array table"); - String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); - logger.info("Dropped sarc_sushilogtmp_json_non_array table"); - - logger.info("Creating sarc_sushilogtmp_json_non_array table"); - String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " - + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" - + " `ItemIdentifier` struct<\n" - + " `Type`: STRING,\n" - + " `Value`: STRING\n" - + " >,\n" - + " `ItemPerformance` struct<\n" - + " `Period`: struct<\n" - + " `Begin`: STRING,\n" - + " `End`: STRING\n" - + " >,\n" - + " `Instance`: struct<\n" - + " `Count`: STRING,\n" - + " `MetricType`: STRING\n" - + " >\n" - + " >" - + ")" - + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" - + "LOCATION '" + sarcsReportPathNonArray + "/'\n" - + "TBLPROPERTIES (\"transactional\"=\"false\")"; - stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); - logger.info("Created sarc_sushilogtmp_json_non_array table"); - - logger.info("Creating sarc_sushilogtmp table"); - String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp(source STRING, repository STRING, " - + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " - + "tblproperties('transactional'='true')"; - stmt.executeUpdate(create_sarc_sushilogtmp); - logger.info("Created sarc_sushilogtmp table"); - - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " - + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " - + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " - + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " - + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " - + "WHERE `ItemIdent`.`Type`='DOI'"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); - - logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " - + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " - + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " - + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; - stmt.executeUpdate(insert_sarc_sushilogtmp); - logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); - - ConnectDB.getHiveConnection().close(); - } - - public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - - Statement stmt = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmt.executeUpdate(createSushilog); - logger.info("Created sushilog table"); - - logger.info("Dropping sarc_sushilogtmp table"); - String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp"; - stmt.executeUpdate(drop_sarc_sushilogtmp); - logger.info("Dropped sarc_sushilogtmp table"); - ConnectDB.getHiveConnection().close(); - - List issnAndUrls = new ArrayList(); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" - }); - issnAndUrls.add(new String[]{ - "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" - }); - issnAndUrls.add(new String[]{ - "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" - }); - - if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 - && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { - logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); - issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); - } - - logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); - - for (String[] issnAndUrl : issnAndUrls) { - logger.info("Now working on ISSN: " + issnAndUrl[1]); - getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); - } - - } - - public void finalizeSarcStats() throws Exception { - stmtHive = ConnectDB.getHiveConnection().createStatement(); - ConnectDB.getHiveConnection().setAutoCommit(false); - stmtImpala = ConnectDB.getImpalaConnection().createStatement(); - - logger.info("Creating downloads_stats table_tmp"); - String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_tmp " - + "(`source` string, " - + "`repository_id` string, " - + "`result_id` string, " - + "`date` string, " - + "`count` bigint, " - + "`openaire` bigint)"; - stmtHive.executeUpdate(createDownloadsStats); - logger.info("Created downloads_stats_tmp table"); - - logger.info("Dropping sarc_sushilogtmp_impala table"); - String drop_sarc_sushilogtmp_impala = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala"; - stmtHive.executeUpdate(drop_sarc_sushilogtmp_impala); - logger.info("Dropped sarc_sushilogtmp_impala table"); - - logger.info("Creating sarc_sushilogtmp_impala, a table readable by impala"); - String createSarcSushilogtmpImpala = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala " - + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(createSarcSushilogtmpImpala); - logger.info("Created sarc_sushilogtmp_impala"); - - logger.info("Making sarc_sushilogtmp visible to impala"); - String invalidateMetadata = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".sarc_sushilogtmp_impala;"; - stmtImpala.executeUpdate(invalidateMetadata); - - logger.info("Dropping downloads_stats_impala table"); - String drop_downloads_stats_impala = "DROP TABLE IF EXISTS " - + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala"; - stmtHive.executeUpdate(drop_downloads_stats_impala); - logger.info("Dropped downloads_stats_impala table"); - - logger.info("Making downloads_stats_impala deletion visible to impala"); - try { - String invalidateMetadataDownloadsStatsImpala = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala;"; - stmtImpala.executeUpdate(invalidateMetadataDownloadsStatsImpala); - } catch (SQLException sqle) { - } - - // We run the following query in Impala because it is faster - logger.info("Creating downloads_stats_impala"); - String createDownloadsStatsImpala = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_impala AS " - + "SELECT s.source, d.id AS repository_id, " - + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " - + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_impala s, " - + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " - + ConnectDB.getStatsDBSchema() + ".result_pids ro " - + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " - + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; - stmtImpala.executeUpdate(createDownloadsStatsImpala); - logger.info("Creating downloads_stats_impala"); - - // Insert into downloads_stats - logger.info("Inserting data from downloads_stats_impala into downloads_stats_tmp"); - String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".downloads_stats_tmp SELECT * " - + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_impala"; - stmtHive.executeUpdate(insertDStats); - logger.info("Inserted into downloads_stats_tmp"); - - logger.info("Creating sushilog table"); - String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog " - + "(`source` string, " - + "`repository_id` string, " - + "`rid` string, " - + "`date` string, " - + "`metric_type` string, " - + "`count` int)"; - stmtHive.executeUpdate(createSushilog); - logger.info("Created sushilog table"); - - // Insert into sushilog - logger.info("Inserting into sushilog"); - String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() - + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; - stmtHive.executeUpdate(insertSushiLog); - logger.info("Inserted into sushilog"); - - stmtHive.close(); - ConnectDB.getHiveConnection().close(); - } - - public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, - String url, String issn) throws Exception { - logger.info("Processing SARC! issn: " + issn + " with url: " + url); - ConnectDB.getHiveConnection().setAutoCommit(false); - - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - // Setting the starting period - Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); - logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); - - // Setting the ending period (last day of the month) - Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); - end.add(Calendar.MONTH, +1); - end.add(Calendar.DAY_OF_MONTH, -1); - logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); - - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); - PreparedStatement st = ConnectDB - .getHiveConnection() - .prepareStatement( - "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); - st.setString(1, issn); - ResultSet rs_date = st.executeQuery(); - Date dateMax = null; - while (rs_date.next()) { - if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") - && !rs_date.getString(1).equals("")) { - start.setTime(sdf.parse(rs_date.getString(1))); - dateMax = sdf.parse(rs_date.getString(1)); - } - } - rs_date.close(); - - // Creating the needed configuration for the correct storing of data - Configuration config = new Configuration(); - config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); - config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); - config - .set( - "fs.hdfs.impl", - org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - config - .set( - "fs.file.impl", - org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem dfs = FileSystem.get(config); - - if (dateMax != null && start.getTime().compareTo(dateMax) <= 0) { - logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + issn); - } else { - - while (start.before(end)) { - String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" - + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); - start.add(Calendar.MONTH, 1); - - logger.info("(getARReport) Getting report: " + reportUrl); - String text = getJson(reportUrl); - if (text == null) { - continue; - } - - JSONParser parser = new JSONParser(); - JSONObject jsonObject = null; - try { - jsonObject = (JSONObject) parser.parse(text); - } // if there is a parsing error continue with the next url - catch (ParseException pe) { - continue; - } - - jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); - jsonObject = (JSONObject) jsonObject.get("sc:Report"); - if (jsonObject == null) { - continue; - } - jsonObject = (JSONObject) jsonObject.get("c:Report"); - jsonObject = (JSONObject) jsonObject.get("c:Customer"); - Object obj = jsonObject.get("c:ReportItems"); - JSONArray jsonArray = new JSONArray(); - if (obj instanceof JSONObject) { - jsonArray.add(obj); - } else { - jsonArray = (JSONArray) obj; - // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); - } - if (jsonArray == null) { - continue; - } - - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" - + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathArray); - FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); - - // Creating the file in the filesystem for the ItemIdentifier as array object - String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" - + simpleDateFormat.format(start.getTime()) + ".json"; - logger.info("Storing to file: " + filePathNonArray); - FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); - - for (Object aJsonArray : jsonArray) { - - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - renameKeysRecursively(":", jsonObjectRow); - - if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { - finNonArray.write(jsonObjectRow.toJSONString().getBytes()); - finNonArray.writeChar('\n'); - } else { - finArray.write(jsonObjectRow.toJSONString().getBytes()); - finArray.writeChar('\n'); - } - } - - finArray.close(); - finNonArray.close(); - - // Check the file size and if it is too big, delete it - File fileArray = new File(filePathArray); - if (fileArray.length() == 0) - fileArray.delete(); - File fileNonArray = new File(filePathNonArray); - if (fileNonArray.length() == 0) - fileNonArray.delete(); - - } - - dfs.close(); - } - //ConnectDB.getHiveConnection().close(); - } - - private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { - for (Object jjval : givenJsonObj) { - if (jjval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jjval); - } else if (jjval instanceof JSONObject) { - renameKeysRecursively(delimiter, (JSONObject) jjval); - } // All other types of vals - else - ; - } - } - - private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { - Set jkeys = new HashSet(givenJsonObj.keySet()); - for (String jkey : jkeys) { - - String[] splitArray = jkey.split(delimiter); - String newJkey = splitArray[splitArray.length - 1]; - - Object jval = givenJsonObj.get(jkey); - givenJsonObj.remove(jkey); - givenJsonObj.put(newJkey, jval); - - if (jval instanceof JSONObject) { - renameKeysRecursively(delimiter, (JSONObject) jval); - } - - if (jval instanceof JSONArray) { - renameKeysRecursively(delimiter, (JSONArray) jval); - } - } - } - - private String getJson(String url) throws Exception { - // String cred=username+":"+password; - // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); - try { - URL website = new URL(url); - URLConnection connection = website.openConnection(); - // connection.setRequestProperty ("Authorization", "Basic "+encoded); - StringBuilder response; - try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { - response = new StringBuilder(); - String inputLine; - while ((inputLine = in.readLine()) != null) { - response.append(inputLine); - response.append("\n"); - } - } - return response.toString(); - } catch (Exception e) { - - // Logging error and silently continuing - logger.error("Failed to get URL: " + e); - System.out.println("Failed to get URL: " + e); -// return null; -// throw new Exception("Failed to get URL: " + e.toString(), e); - } - return ""; - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java deleted file mode 100644 index 405b58bd5..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java +++ /dev/null @@ -1,179 +0,0 @@ - -package eu.dnetlib.oa.graph.usagestats.export; - -import java.io.IOException; -import java.sql.SQLException; -import java.sql.Statement; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Main class for downloading and processing Usage statistics - * - * @author D. Pierrakos, S. Zoupanos - */ -public class UsageStatsExporter { - - public UsageStatsExporter() { - - } - - private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); - - private void reCreateLogDirs() throws IllegalArgumentException, IOException { - FileSystem dfs = FileSystem.get(new Configuration()); - - logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); - dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); - - logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); - dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); - - logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); - - logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); - - logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); - - logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); - dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); - } - - public void export() throws Exception { - - logger.info("Initialising DB properties"); - ConnectDB.init(); - -// runImpalaQuery(); - - PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); - - logger.info("Re-creating database and tables"); - if (ExecuteWorkflow.recreateDbAndTables) - piwikstatsdb.recreateDBAndTables(); - ; - - logger.info("Initializing the download logs module"); - PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); - - if (ExecuteWorkflow.piwikEmptyDirs) { - logger.info("Recreating Piwik log directories"); - piwikstatsdb.reCreateLogDirs(); - } - - // Downloading piwik logs (also managing directory creation) - if (ExecuteWorkflow.downloadPiwikLogs) { - logger.info("Downloading piwik logs"); - piwd - .GetOpenAIRELogs( - ExecuteWorkflow.repoLogPath, - ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); - } - logger.info("Downloaded piwik logs"); - - // Create DB tables, insert/update statistics - String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; - piwikstatsdb.setCounterRobotsURL(cRobotsUrl); - - if (ExecuteWorkflow.processPiwikLogs) { - logger.info("Processing logs"); - piwikstatsdb.processLogs(); - } - - logger.info("Creating LaReferencia tables"); - LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, - ExecuteWorkflow.lareferenciaAuthToken); - - if (ExecuteWorkflow.laReferenciaEmptyDirs) { - logger.info("Recreating LaReferencia log directories"); - lrf.reCreateLogDirs(); - } - - if (ExecuteWorkflow.downloadLaReferenciaLogs) { - logger.info("Downloading LaReferencia logs"); - lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); - logger.info("Downloaded LaReferencia logs"); - } - LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); - - if (ExecuteWorkflow.processLaReferenciaLogs) { - logger.info("Processing LaReferencia logs"); - lastats.processLogs(); - logger.info("LaReferencia logs done"); - } - - IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); - if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { - logger.info("Creating Irus Stats tables"); - irusstats.createTables(); - logger.info("Created Irus Stats tables"); - - logger.info("Re-create log dirs"); - irusstats.reCreateLogDirs(); - logger.info("Re-created log dirs"); - } - - if (ExecuteWorkflow.irusDownloadReports) { - irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); - } - if (ExecuteWorkflow.irusProcessStats) { - irusstats.processIrusStats(); - logger.info("Irus done"); - } - - SarcStats sarcStats = new SarcStats(); - if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) { - sarcStats.reCreateLogDirs(); - } - if (ExecuteWorkflow.sarcDownloadReports) { - sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - } - if (ExecuteWorkflow.sarcProcessStats) { - sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); - sarcStats.finalizeSarcStats(); - } - logger.info("Sarc done"); - - // finalize usagestats - if (ExecuteWorkflow.finalizeStats) { - piwikstatsdb.finalizeStats(); - logger.info("Finalized stats"); - } - - // Make the tables available to Impala - if (ExecuteWorkflow.finalTablesVisibleToImpala) { - logger.info("Making tables visible to Impala"); - invalidateMetadata(); - } - - logger.info("End"); - } - - private void invalidateMetadata() throws SQLException { - Statement stmt = null; - - stmt = ConnectDB.getImpalaConnection().createStatement(); - - String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; - stmt.executeUpdate(sql); - - sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; - stmt.executeUpdate(sql); - - stmt.close(); - ConnectDB.getHiveConnection().close(); - } -} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json deleted file mode 100644 index 988c23b48..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json +++ /dev/null @@ -1,231 +0,0 @@ -[ - { - "paramName": "mat", - "paramLongName": "matomoAuthToken", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false - }, - { - "paramName": "mbu", - "paramLongName": "matomoBaseURL", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, - { - "paramName": "rlp", - "paramLongName": "repoLogPath", - "paramDescription": "nameNode of the source cluster", - "paramRequired": true - }, - { - "paramName": "plp", - "paramLongName": "portalLogPath", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "pmi", - "paramLongName": "portalMatomoID", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "iukbuw", - "paramLongName": "irusUKBaseURL", - "paramDescription": "working directory", - "paramRequired": true - }, - { - "paramName": "iukrp", - "paramLongName": "irusUKReportPath", - "paramDescription": "maximum number of map tasks used in the distcp process", - "paramRequired": true - }, - { - "paramName": "srpa", - "paramLongName": "sarcsReportPathArray", - "paramDescription": "memory for distcp action copying actionsets from remote cluster", - "paramRequired": true - }, - { - "paramName": "srpna", - "paramLongName": "sarcsReportPathNonArray", - "paramDescription": "timeout for distcp copying actions from remote cluster", - "paramRequired": true - }, - { - "paramName": "llp", - "paramLongName": "lareferenciaLogPath", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "lbu", - "paramLongName": "lareferenciaBaseURL", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "lat", - "paramLongName": "lareferenciaAuthToken", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbhu", - "paramLongName": "dbHiveUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "dbiu", - "paramLongName": "dbImpalaUrl", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "usdbs", - "paramLongName": "usageStatsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "sdbs", - "paramLongName": "statsDBSchema", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - }, - { - "paramName": "rdbt", - "paramLongName": "recreateDbAndTables", - "paramDescription": "Re-create database and initial tables?", - "paramRequired": true - }, - { - "paramName": "pwed", - "paramLongName": "piwikEmptyDirs", - "paramDescription": "Empty piwik directories?", - "paramRequired": true - }, - { - "paramName": "ppwl", - "paramLongName": "processPiwikLogs", - "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", - "paramRequired": true - }, - { - "paramName": "dpwl", - "paramLongName": "downloadPiwikLogs", - "paramDescription": "download piwik logs?", - "paramRequired": true - }, - { - "paramName": "slp", - "paramLongName": "startingLogPeriod", - "paramDescription": "Starting log period", - "paramRequired": true - }, - { - "paramName": "elp", - "paramLongName": "endingLogPeriod", - "paramDescription": "Ending log period", - "paramRequired": true - }, - { - "paramName": "npidd", - "paramLongName": "numberOfPiwikIdsToDownload", - "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload", - "paramRequired": true - }, - { - "paramName": "nsidd", - "paramLongName": "numberOfSiteIdsToDownload", - "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload", - "paramRequired": true - }, - { - "paramName": "lerd", - "paramLongName": "laReferenciaEmptyDirs", - "paramDescription": "Empty LaReferencia directories?", - "paramRequired": true - }, - { - "paramName": "plrl", - "paramLongName": "processLaReferenciaLogs", - "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", - "paramRequired": true - }, - { - "paramName": "dlrl", - "paramLongName": "downloadLaReferenciaLogs", - "paramDescription": "download La Referencia logs?", - "paramRequired": true - }, - { - "paramName": "icted", - "paramLongName": "irusCreateTablesEmptyDirs", - "paramDescription": "Irus section: Create tables and empty JSON directories?", - "paramRequired": true - }, - { - "paramName": "idr", - "paramLongName": "irusDownloadReports", - "paramDescription": "Irus section: Download reports?", - "paramRequired": true - }, - { - "paramName": "ipr", - "paramLongName": "irusProcessStats", - "paramDescription": "Irus section: Process stats?", - "paramRequired": true - }, - { - "paramName": "inod", - "paramLongName": "irusNumberOfOpendoarsToDownload", - "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", - "paramRequired": true - }, - { - "paramName": "icted", - "paramLongName": "sarcCreateTablesEmptyDirs", - "paramDescription": "Sarc section: Create tables and empty JSON directories?", - "paramRequired": true - }, - { - "paramName": "idr", - "paramLongName": "sarcDownloadReports", - "paramDescription": "Sarc section: Download reports?", - "paramRequired": true - }, - { - "paramName": "ipr", - "paramLongName": "sarcProcessStats", - "paramDescription": "Sarc section: Process stats?", - "paramRequired": true - }, - { - "paramName": "inod", - "paramLongName": "sarcNumberOfIssnToDownload", - "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", - "paramRequired": true - }, - - { - "paramName": "fs", - "paramLongName": "finalizeStats", - "paramDescription": "Create the usage_stats table?", - "paramRequired": true - }, - { - "paramName": "ftvi", - "paramLongName": "finalTablesVisibleToImpala", - "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", - "paramRequired": true - }, - { - "paramName": "nodt", - "paramLongName": "numberOfDownloadThreads", - "paramDescription": "Number of download threads", - "paramRequired": true - } -] diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml deleted file mode 100644 index b5c807378..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/config-default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - jobTracker - ${jobTracker} - - - nameNode - ${nameNode} - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 - - - impalaJdbcUrl - jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; - - - oozie.wf.workflow.notification.url - {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status - - - oozie.use.system.libpath - true - - diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml deleted file mode 100644 index 8d62a85a9..000000000 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml +++ /dev/null @@ -1,90 +0,0 @@ - - - - hiveMetastoreUris - Hive server metastore URIs - - - hiveJdbcUrl - Hive server jdbc url - - - impalaJdbcUrl - Impala server jdbc url - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hiveMetastoreUris} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - eu.dnetlib.oa.graph.usagestats.export.ExecuteWorkflow - --matomoAuthToken${matomoAuthToken} - --matomoBaseURL${matomoBaseURL} - --repoLogPath${repoLogPath} - --portalLogPath${portalLogPath} - --portalMatomoID${portalMatomoID} - --irusUKBaseURL${irusUKBaseURL} - --irusUKReportPath${irusUKReportPath} - --sarcsReportPathArray${sarcsReportPathArray} - --sarcsReportPathNonArray${sarcsReportPathNonArray} - --lareferenciaLogPath${lareferenciaLogPath} - --lareferenciaBaseURL${lareferenciaBaseURL} - --lareferenciaAuthToken${lareferenciaAuthToken} - --dbHiveUrl${hiveJdbcUrl} - --dbImpalaUrl${impalaJdbcUrl} - --usageStatsDBSchema${usageStatsDBSchema} - --statsDBSchema${statsDBSchema} - --recreateDbAndTables${recreateDbAndTables} - --piwikEmptyDirs${piwikEmptyDirs} - --downloadPiwikLogs${downloadPiwikLogs} - --processPiwikLogs${processPiwikLogs} - --startingLogPeriod${startingLogPeriod} - --endingLogPeriod${endingLogPeriod} - --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} - --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} - --laReferenciaEmptyDirs${laReferenciaEmptyDirs} - --downloadLaReferenciaLogs${downloadLaReferenciaLogs} - --processLaReferenciaLogs${processLaReferenciaLogs} - --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} - --irusDownloadReports${irusDownloadReports} - --irusProcessStats${irusProcessStats} - --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload} - --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs} - --sarcDownloadReports${sarcDownloadReports} - --sarcProcessStats${sarcProcessStats} - --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} - --finalizeStats${finalizeStats} - --finalTablesVisibleToImpala${finalTablesVisibleToImpala} - --numberOfDownloadThreads${numberOfDownloadThreads} - - - - - - - -