From af62b14f91fd2da7f904d66d9e6e1eb9837294d7 Mon Sep 17 00:00:00 2001
From: Spyros Zoupanos
Date: Thu, 7 May 2020 19:00:03 +0300
Subject: [PATCH] Adding the main java files, the directory structure and main workflow file

---
 dhp-workflows/dhp-usage-stats-update/pom.xml | 32 +
 .../graph/usage-stats/export/ConnectDB.java | 66 ++
 .../usage-stats/export/ExecuteWorkflow.java | 43 +
 .../graph/usage-stats/export/IrusStats.java | 431 +++++++
 .../usage-stats/export/PiwikDownloadLogs.java | 132 +++
 .../usage-stats/export/PiwikStatsDB.java | 1022 +++++++++++++++++
 .../export/ReadCounterRobotsList.java | 56 +
 .../graph/usage-stats/export/SarcStats.java | 255 ++++
 .../export/UsageStatsExporter.java | 57 +
 .../oa/graph/usage-stats/export/index.html | 43 +
 .../usage-stats/oozie_app/config-default.xml | 30 +
 .../graph/usage-stats/oozie_app/workflow.xml | 76 ++
 dhp-workflows/pom.xml | 1 +
 13 files changed, 2244 insertions(+)
 create mode 100644 dhp-workflows/dhp-usage-stats-update/pom.xml
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ConnectDB.java
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ExecuteWorkflow.java
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/IrusStats.java
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/PiwikDownloadLogs.java
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/PiwikStatsDB.java
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ReadCounterRobotsList.java
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/SarcStats.java
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/UsageStatsExporter.java
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/index.html
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usage-stats/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usage-stats/oozie_app/workflow.xml

diff --git a/dhp-workflows/dhp-usage-stats-update/pom.xml b/dhp-workflows/dhp-usage-stats-update/pom.xml
new file mode 100644
index 000000000..f85872fbd
--- /dev/null
+++ b/dhp-workflows/dhp-usage-stats-update/pom.xml
@@ -0,0 +1,32 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>dhp-workflows</artifactId>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <version>1.1.7-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <artifactId>dhp-usage-stats-update</artifactId>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.11</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+        </dependency>
+    </dependencies>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>pl.project13.maven</groupId>
+                <artifactId>git-commit-id-plugin</artifactId>
+                <version>2.1.11</version>
+                <configuration>
+                    <failOnNoGitDirectory>false</failOnNoGitDirectory>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ConnectDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ConnectDB.java
new file mode 100644
index 000000000..d4b9e6786
--- /dev/null
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ConnectDB.java
@@ -0,0 +1,66 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */ + +package eu.dnetlib.usagestats.export; + +/* + @author dpie + */ + +/* + @author dpie + */ +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.Properties; + +import org.apache.log4j.Logger; + +public abstract class ConnectDB { + + private static Connection DB_CONNECTION; + + private static String dbURL; + private static String dbUsername; + private static String dbPassword; + private static String defaultDBSchema; + private final static Logger log = Logger.getLogger(ConnectDB.class); + + static void init(Properties properties) throws ClassNotFoundException { + + dbURL = properties.getProperty("Stats_db_Url"); + dbUsername = properties.getProperty("Stats_db_User"); + dbPassword = properties.getProperty("Stats_db_Pass"); + defaultDBSchema = properties.getProperty("Stats_db_Schema"); + + Class.forName(properties.getProperty("Stats_db_Driver")); + } + + public static Connection getConnection() throws SQLException { + if (DB_CONNECTION != null && !DB_CONNECTION.isClosed()) { + return DB_CONNECTION; + } else { + DB_CONNECTION = connect(); + + return DB_CONNECTION; + } + } + + private static Connection connect() throws SQLException { + Connection connection = DriverManager.getConnection(dbURL, dbUsername, dbPassword); + Statement stmt = connection.createStatement(); + String sqlSetSearchPath = "SET search_path TO " + defaultDBSchema + ";"; + stmt.executeUpdate(sqlSetSearchPath); + stmt.close(); + + log.debug("Opened database successfully"); + + return connection; + } + +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ExecuteWorkflow.java new file mode 100644 index 000000000..3e980c4bd --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ExecuteWorkflow.java @@ -0,0 +1,43 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package eu.dnetlib.usagestats.export; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Properties; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author dpie + */ +public class ExecuteWorkflow { + + public static void main(String args[]) throws Exception { + + Properties prop = new Properties(); + InputStream propertiesInputStream = UsageStatsExporter.class + .getClassLoader() + .getResourceAsStream("usagestats.properties"); + prop.load(propertiesInputStream); + + UsageStatsExporter usagestatsExport = new UsageStatsExporter(prop); + usagestatsExport.export(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/IrusStats.java new file mode 100644 index 000000000..8062ce428 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/IrusStats.java @@ -0,0 +1,431 @@ + +package eu.dnetlib.usagestats.export; + +/** + * + * @author dpie + */ + +/** + * @author dpie + */ +import java.io.*; +// import java.io.BufferedReader; +// import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; + +import org.apache.log4j.Logger; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; + +/** + * Created by dpie on 20/01/2020. 
+ */ +public class IrusStats { + + private String irusUKURL; + +// private Connection conn = null; +// private Statement stmt = null; + + private final Logger log = Logger.getLogger(this.getClass()); + + public IrusStats(String irusUKURL) throws Exception { + this.irusUKURL = irusUKURL; + createTables(); + createTmpTables(); + } + + private void createTables() throws Exception { + try { + + Statement stmt = ConnectDB.getConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmt.executeUpdate(sqlCreateTableSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmt.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmt.executeUpdate(createSushiIndex); + + stmt.close(); + ConnectDB.getConnection().close(); + log.info("Sushi Tables Created"); + } catch (Exception e) { + log.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + private void createTmpTables() throws Exception { + try { + + Statement stmt = ConnectDB.getConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmt.executeUpdate(sqlCreateTableSushiLog); + + // stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;"); + // String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilogtmp " + + " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository," + + "sushilogtmp.rid, sushilogtmp.date " + + "FROM sushilogtmp " + + "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmt.executeUpdate(sqlcreateRuleSushiLog); + + stmt.close(); + ConnectDB.getConnection().close(); + log.info("Sushi Tmp Tables Created"); + } catch (Exception e) { + log.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void irusStats() throws Exception { + Statement stmt = ConnectDB.getConnection().createStatement(); + ConnectDB.getConnection().setAutoCommit(false); + + // String sql = "INSERT INTO sushi_result_downloads SELECT s.source, d.id AS repository, ro.id, s.date, s.count + // FROM sushilog s, datasource_oids d, result_oids ro WHERE s.repository=d.orid AND s.oai=ro.orid AND + // metric_type='ft_total'"; + // String sql = "SELECT s.source, d.id AS repository_id, ro.id as result_id, extract('year' from s.date::date) + // ||'/'|| 
LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') as date, s.count INTO + // downloads_stats FROM sushilog s, datasource_oids d, result_oids ro WHERE s.repository=d.orid AND + // s.oai=ro.orid AND metric_type='ft_total'"; + // String sql = "INSERT INTO downloads_stats SELECT s.source, d.id AS repository_id, ro.id as result_id, + // extract('year' from s.date::date) ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') + // as date, s.count FROM sushilog s, datasource_oids d, result_oids ro WHERE s.repository=d.orid AND + // s.oai=ro.orid AND metric_type='ft_total';"; + String sql = "INSERT INTO downloads_stats SELECT s.source, d.id AS repository_id, ro.id as result_id, extract('year' from s.date::date) ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') as date, s.count, '0' FROM sushilogtmp s, public.datasource_oids d, public.result_oids ro WHERE s.repository=d.orid AND s.rid=ro.orid AND metric_type='ft_total' AND s.source='IRUS-UK';"; + stmt.executeUpdate(sql); + + sql = "Insert into sushilog select * from sushilogtmp;"; + stmt.executeUpdate(sql); + + ConnectDB.getConnection().commit(); + ConnectDB.getConnection().close(); + } + + public void processIrusRRReport() throws Exception { + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + // String reportUrl = "https://irus.jisc.ac.uk" + + // "/api/sushilite/v1_7/GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=2016-01&EndDate=" + + // simpleDateFormat.format(new Date()) + + // "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=2016-01&EndDate=" + + simpleDateFormat.format(new Date()) + + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + + log.info("Getting Irus report: " + reportUrl); + + String text = getJson(reportUrl, "", ""); + + log.info("Report: " + text); + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + int i = 0; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject opendoar = (JSONObject) identifier; + if (opendoar.get("Type").toString().equals("OpenDOAR")) { + // System.out.println(i + ": " + opendoar.get("Value").toString()); + log.info(i + ": " + opendoar.get("Value").toString()); + i++; + processIrusIRReport(opendoar.get("Value").toString()); + break; + } + } + // break; + } + } + + private void processIrusIRReport(String opendoar) throws Exception { + System.out.println(opendoar); + ConnectDB.getConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + + Calendar start = Calendar.getInstance(); + start.set(Calendar.YEAR, 2016); + start.set(Calendar.MONTH, Calendar.JANUARY); + // start.setTime(simpleDateFormat.parse("2016-01")); + + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + 
.getConnection() + .prepareStatement("SELECT max(date) FROM sushilog WHERE repository=?;"); + st.setString(1, "opendoar____::" + opendoar); + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + } + } + rs_date.close(); + PreparedStatement preparedStatement = ConnectDB + .getConnection() + .prepareStatement( + "INSERT INTO sushilogtmp (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)"); + int batch_size = 0; + + while (start.before(end)) { + // log.info("date: " + simpleDateFormat.format(start.getTime())); + String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) + + "&RepositoryIdentifier=opendoar%3A" + opendoar + + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + start.add(Calendar.MONTH, 1); + + String text = getJson(reportUrl, "", ""); + if (text == null) { + continue; + } + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray == null) { + continue; + } + String oai = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject oaiPmh = (JSONObject) identifier; + if (oaiPmh.get("Type").toString().equals("OAI")) { + oai = oaiPmh.get("Value").toString(); + // System.out.println("OAI: " + oai); + break; + } + } + + JSONArray itemPerformance = (JSONArray) jsonObjectRow.get("ItemPerformance"); + String period; + String type; + String count; + for (Object perf : itemPerformance) { + JSONObject performance = (JSONObject) perf; + JSONObject periodObj = (JSONObject) performance.get("Period"); + period = periodObj.get("Begin").toString(); + JSONObject instanceObj = (JSONObject) performance.get("Instance"); + type = instanceObj.get("MetricType").toString(); + count = instanceObj.get("Count").toString(); + // System.out.println(oai + " : " + period + " : " + count); + + preparedStatement.setString(1, "IRUS-UK"); + preparedStatement.setString(2, "opendoar____::" + opendoar); + preparedStatement.setString(3, oai); + preparedStatement.setString(4, period); + preparedStatement.setString(5, type); + preparedStatement.setInt(6, Integer.parseInt(count)); + preparedStatement.addBatch(); + batch_size++; + if (batch_size == 10000) { + preparedStatement.executeBatch(); + ConnectDB.getConnection().commit(); + batch_size = 0; + } + } + // break; + } + // break; + } + + preparedStatement.executeBatch(); + ConnectDB.getConnection().commit(); + ConnectDB.getConnection().close(); + } + + public void processIrusIRReport(String opendoar, String startDate) throws Exception { + ConnectDB.getConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + + Calendar start = Calendar.getInstance(); + start.set(Calendar.YEAR, 2016); + start.set(Calendar.MONTH, Calendar.JANUARY); + // 
start.setTime(simpleDateFormat.parse("2016-01")); + + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + start.setTime(sdf.parse(startDate)); + + String createTablesQuery = "-- Table: shadow.sushilog" + opendoar + "\n" + + "\n" + + "-- DROP TABLE shadow.sushilog" + opendoar + ";\n" + + "\n" + + "CREATE TABLE shadow.sushilog" + opendoar + "\n" + + "(\n" + + " source text COLLATE pg_catalog.\"default\" NOT NULL,\n" + + " repository text COLLATE pg_catalog.\"default\" NOT NULL,\n" + + " rid text COLLATE pg_catalog.\"default\" NOT NULL,\n" + + " date text COLLATE pg_catalog.\"default\" NOT NULL,\n" + + " metric_type text COLLATE pg_catalog.\"default\" NOT NULL,\n" + + " count integer,\n" + + " CONSTRAINT sushilog" + opendoar + "_pkey PRIMARY KEY (source, repository, rid, date, metric_type)\n" + + " USING INDEX TABLESPACE index_storage\n" + + ")\n" + + "\n" + + "TABLESPACE pg_default;\n" + + "\n" + + "ALTER TABLE shadow.sushilog" + opendoar + "\n" + + " OWNER to sqoop;\n" + + "\n" + + "-- Rule: ignore_duplicate_inserts ON shadow.sushilog" + opendoar + "\n" + + "\n" + + "-- DROP Rule ignore_duplicate_inserts ON shadow.sushilog" + opendoar + ";\n" + + "\n" + + "CREATE OR REPLACE RULE ignore_duplicate_inserts AS\n" + + " ON INSERT TO shadow.sushilog" + opendoar + "\n" + + " WHERE (EXISTS ( SELECT sushilog" + opendoar + ".source,\n" + + " sushilog" + opendoar + ".repository,\n" + + " sushilog" + opendoar + ".rid,\n" + + " sushilog" + opendoar + ".date\n" + + " FROM sushilog" + opendoar + "\n" + + " WHERE sushilog" + opendoar + ".source = new.source AND sushilog" + opendoar + + ".repository = new.repository AND sushilog" + opendoar + ".rid = new.rid AND sushilog" + opendoar + + ".date = new.date AND sushilog" + opendoar + ".metric_type = new.metric_type))\n" + + " DO INSTEAD\n" + + "NOTHING;"; + + Statement stCreateTables = ConnectDB.getConnection().createStatement(); + stCreateTables.execute(createTablesQuery); + ConnectDB.getConnection().commit(); + + PreparedStatement preparedStatement = ConnectDB + .getConnection() + .prepareStatement( + "INSERT INTO sushilog" + opendoar + + " (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)"); + int batch_size = 0; + + while (start.before(end)) { + // log.info("date: " + simpleDateFormat.format(start.getTime())); + String reportUrl = "https://irus.jisc.ac.uk/api/sushilite/v1_7/GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=2019-10-31&RepositoryIdentifier=opendoar%3A" + + opendoar + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + start.add(Calendar.MONTH, 1); + + String text = getJson(reportUrl, "", ""); + if (text == null) { + continue; + } + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray == null) { + continue; + } + String oai = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject oaiPmh = (JSONObject) identifier; + 
if (oaiPmh.get("Type").toString().equals("OAI")) { + oai = oaiPmh.get("Value").toString(); + // System.out.println("OAI: " + oai); + break; + } + } + + JSONArray itemPerformance = (JSONArray) jsonObjectRow.get("ItemPerformance"); + String period; + String type; + String count; + for (Object perf : itemPerformance) { + JSONObject performance = (JSONObject) perf; + JSONObject periodObj = (JSONObject) performance.get("Period"); + period = periodObj.get("Begin").toString(); + JSONObject instanceObj = (JSONObject) performance.get("Instance"); + type = instanceObj.get("MetricType").toString(); + count = instanceObj.get("Count").toString(); + // System.out.println(oai + " : " + period + " : " + count); + + preparedStatement.setString(1, "IRUS-UK"); + preparedStatement.setString(2, "opendoar____::" + opendoar); + preparedStatement.setString(3, oai); + preparedStatement.setString(4, period); + preparedStatement.setString(5, type); + preparedStatement.setInt(6, Integer.parseInt(count)); + preparedStatement.addBatch(); + batch_size++; + if (batch_size == 10000) { + preparedStatement.executeBatch(); + ConnectDB.getConnection().commit(); + batch_size = 0; + } + } + // break; + } + // break; + } + + preparedStatement.executeBatch(); + ConnectDB.getConnection().commit(); + ConnectDB.getConnection().close(); + } + + private String getJson(String url, String username, String password) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + log.error("Failed to get URL", e); + return null; + } + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/PiwikDownloadLogs.java new file mode 100644 index 000000000..ab6645c3e --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/PiwikDownloadLogs.java @@ -0,0 +1,132 @@ + +package eu.dnetlib.usagestats.export; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.Logger; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Calendar; + +public class PiwikDownloadLogs { + + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; + + /* + * The Piwik's API method + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + + private final Logger log = Logger.getLogger(this.getClass()); + + public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + + } + + private 
String getPiwikLogUrl() { + return "https://" + piwikUrl + "/"; + } + + private String getJson(String url) throws Exception { + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + log.error("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } + + public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { + + Statement statement = ConnectDB.getConnection().createStatement(); + + ResultSet rs = statement.executeQuery("SELECT distinct piwik_id from public.datasource where piwik_id is not null order by piwik_id;"); + while (rs.next()) { + int siteId = rs.getInt(1); + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + + Calendar start = Calendar.getInstance(); + start.set(Calendar.YEAR, 2016); + start.set(Calendar.MONTH, Calendar.MARCH); + //start.setTime(simpleDateFormat.parse("2016-01")); + + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB.DB_CONNECTION.prepareStatement("SELECT max(timestamp) FROM piwiklog WHERE source=? HAVING max(timestamp) is not null;"); + st.setInt(1, siteId); + + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + } + } + rs_date.close(); + + for (Date date = start.getTime(); start.before(end); start.add(Calendar.DATE, 1), date = start.getTime()) { + log.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + //portal siteId = 109; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true); + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + + int i = 0; + + while (!content.equals("[]\n")) { + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + + fin.write(content.getBytes()); + + i++; + } + fin.close(); + + } + + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/PiwikStatsDB.java new file mode 100644 index 000000000..e4e706745 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/PiwikStatsDB.java @@ -0,0 +1,1022 @@ + +package eu.dnetlib.usagestats.export; + +import java.io.*; +import java.net.URLDecoder; +import 
java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.sql.Statement; +import java.sql.Timestamp; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Stream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.log4j.Logger; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; + +public class PiwikStatsDB { + + private String logPath; + private String logRepoPath; + private String logPortalPath; + + private Statement stmt = null; + + private final Logger log = Logger.getLogger(this.getClass()); + private String CounterRobotsURL; + private ArrayList robotsList; + + public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception { + this.logRepoPath = logRepoPath; + this.logPortalPath = logPortalPath; + this.createTables(); + this.createTmpTables(); + } + + public void foo() { + Stream s = Arrays.stream(new String[] { + "a", "b", "c", "d" + }); + + System.out.println(s.parallel().count()); + } + + public ArrayList getRobotsList() { + return robotsList; + } + + public void setRobotsList(ArrayList robotsList) { + this.robotsList = robotsList; + } + + public String getCounterRobotsURL() { + return CounterRobotsURL; + } + + public void setCounterRobotsURL(String CounterRobotsURL) { + this.CounterRobotsURL = CounterRobotsURL; + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getConnection().createStatement(); + String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS piwiklog(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; + String sqlcreateRulePiwikLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO piwiklog " + + " WHERE (EXISTS ( SELECT piwiklog.source, piwiklog.id_visit," + + "piwiklog.action, piwiklog.\"timestamp\", piwiklog.entity_id " + + "FROM piwiklog " + + "WHERE piwiklog.source = new.source AND piwiklog.id_visit = new.id_visit AND piwiklog.action = new.action AND piwiklog.entity_id = new.entity_id AND piwiklog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; + String sqlCreateRuleIndexPiwikLog = "create index if not exists piwiklog_rule on piwiklog(source, id_visit, action, entity_id, \"timestamp\");"; + stmt.executeUpdate(sqlCreateTablePiwikLog); + stmt.executeUpdate(sqlcreateRulePiwikLog); + stmt.executeUpdate(sqlCreateRuleIndexPiwikLog); + + String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS process_portal_log(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, timestamp));"; + String sqlcreateRulePortalLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO process_portal_log " + + " WHERE (EXISTS ( SELECT process_portal_log.source, process_portal_log.id_visit," + + "process_portal_log.\"timestamp\" " + + "FROM process_portal_log " + + "WHERE process_portal_log.source = new.source AND process_portal_log.id_visit = new.id_visit AND process_portal_log.\"timestamp\" = new.\"timestamp\")) DO 
INSTEAD NOTHING;"; + String sqlCreateRuleIndexPortalLog = "create index if not exists process_portal_log_rule on process_portal_log(source, id_visit, \"timestamp\");"; + stmt.executeUpdate(sqlCreateTablePortalLog); + stmt.executeUpdate(sqlcreateRulePortalLog); + stmt.executeUpdate(sqlCreateRuleIndexPiwikLog); + + stmt.close(); + ConnectDB.getConnection().close(); + log.info("Usage Tables Created"); + + } catch (Exception e) { + log.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + private void createTmpTables() throws Exception { + try { + Statement stmt = ConnectDB.getConnection().createStatement(); + String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS piwiklogtmp(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; + String sqlcreateTmpRulePiwikLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO piwiklogtmp " + + " WHERE (EXISTS ( SELECT piwiklogtmp.source, piwiklogtmp.id_visit," + + "piwiklogtmp.action, piwiklogtmp.\"timestamp\", piwiklogtmp.entity_id " + + "FROM piwiklogtmp " + + "WHERE piwiklogtmp.source = new.source AND piwiklogtmp.id_visit = new.id_visit AND piwiklogtmp.action = new.action AND piwiklogtmp.entity_id = new.entity_id AND piwiklogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; + stmt.executeUpdate(sqlCreateTmpTablePiwikLog); + stmt.executeUpdate(sqlcreateTmpRulePiwikLog); + + // String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;"; + // stmt.executeUpdate(sqlCopyPublicPiwiklog); + String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS process_portal_log_tmp(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, timestamp));"; + String sqlcreateTmpRulePortalLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO process_portal_log_tmp " + + " WHERE (EXISTS ( SELECT process_portal_log_tmp.source, process_portal_log_tmp.id_visit," + + "process_portal_log_tmp.\"timestamp\" " + + "FROM process_portal_log_tmp " + + "WHERE process_portal_log_tmp.source = new.source AND process_portal_log_tmp.id_visit = new.id_visit AND process_portal_log_tmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; + stmt.executeUpdate(sqlCreateTmpTablePortalLog); + stmt.executeUpdate(sqlcreateTmpRulePortalLog); + + stmt.close(); + log.info("Usage Tmp Tables Created"); + + } catch (Exception e) { + log.error("Failed to create tmptables: " + e); + throw new Exception("Failed to create tmp tables: " + e.toString(), e); + // System.exit(0); + } + } + + public void processLogs() throws Exception { + try { + ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); + this.robotsList = counterRobots.getRobotsPatterns(); + + processRepositoryLog(); + log.info("repository process done"); + removeDoubleClicks(); + log.info("removing double clicks done"); + cleanOAI(); + log.info("cleaning oai done"); + + viewsStats(); + downloadsStats(); + + processPortalLog(); + log.info("portal process done"); + + portalStats(); + log.info("portal usagestats done"); + + updateProdTables(); + log.info("updateProdTables done"); + + } catch (Exception e) { + log.error("Failed to process logs: " + e); + throw new 
Exception("Failed to process logs: " + e.toString(), e); + } + } + +// public void usageStats() throws Exception { +// try { +// viewsStats(); +// downloadsStats(); +// log.info("stat tables and views done"); +// } catch (Exception e) { +// log.error("Failed to create usage usagestats: " + e); +// throw new Exception("Failed to create usage usagestats: " + e.toString(), e); +// } +// } + + public void processRepositoryLog() throws Exception { + Statement stmt = ConnectDB.getConnection().createStatement(); + ConnectDB.getConnection().setAutoCommit(false); + + ArrayList jsonFiles = listHdfsDir(this.logRepoPath); +// File dir = new File(this.logRepoPath); +// File[] jsonFiles = dir.listFiles(); + + PreparedStatement prepStatem = ConnectDB + .getConnection() + .prepareStatement( + "INSERT INTO piwiklogtmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)"); + int batch_size = 0; + JSONParser parser = new JSONParser(); + for (String jsonFile : jsonFiles) { + System.out.println(jsonFile); + JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile)); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString()); + String idVisit = jsonObjectRow.get("idVisit").toString(); + String country = jsonObjectRow.get("country").toString(); + String referrerName = jsonObjectRow.get("referrerName").toString(); + String agent = jsonObjectRow.get("browser").toString(); + boolean botFound = false; + Iterator it = robotsList.iterator(); + while (it.hasNext()) { + // Create a Pattern object + Pattern r = Pattern.compile(it.next().toString()); + // Now create matcher object. + Matcher m = r.matcher(agent); + if (m.find()) { + // System.out.println("Found value: " + m.group(0)); + botFound = true; + break; + } + } + if (botFound == false) { + String sourceItemType = "repItem"; + + JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails")); + for (Object actionDetail : actionDetails) { + JSONObject actionDetailsObj = (JSONObject) actionDetail; + + if (actionDetailsObj.get("customVariables") != null) { + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + Timestamp timestamp = new Timestamp( + Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000); + String url = actionDetailsObj.get("url").toString(); + String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")) + .get("1")).get("customVariablePageValue1").toString(); + String action = actionDetailsObj.get("type").toString(); + + prepStatem.setInt(1, idSite); + prepStatem.setString(2, idVisit); + prepStatem.setString(3, country); + prepStatem.setString(4, action); + prepStatem.setString(5, url); + prepStatem.setString(6, oaipmh); + prepStatem.setString(7, sourceItemType); + prepStatem.setString(8, simpleDateFormat.format(timestamp)); + prepStatem.setString(9, referrerName); + prepStatem.setString(10, agent); + prepStatem.addBatch(); + batch_size++; + if (batch_size == 10000) { + prepStatem.executeBatch(); + ConnectDB.getConnection().commit(); + batch_size = 0; + } + } + } + } + } + } + prepStatem.executeBatch(); + ConnectDB.getConnection().commit(); + stmt.close(); + } + + public void removeDoubleClicks() throws Exception { + Statement stmt = ConnectDB.getConnection().createStatement(); + 
ConnectDB.getConnection().setAutoCommit(false); + + // clean download double clicks + String sql = "DELETE FROM piwiklogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp FROM piwiklogtmp p1, piwiklogtmp p2 WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp AND p1.timestamp jsonFiles = listHdfsDir(this.logPortalPath); +// File folder = new File(this.logPortalPath); +// File[] jsonFiles = folder.listFiles(); + + PreparedStatement prepStatem = ConnectDB + .getConnection() + .prepareStatement( + "INSERT INTO process_portal_log_tmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)"); + int batch_size = 0; + JSONParser parser = new JSONParser(); + for (String jsonFile : jsonFiles) { + JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile)); + + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString()); + String idVisit = jsonObjectRow.get("idVisit").toString(); + String country = jsonObjectRow.get("country").toString(); + String referrerName = jsonObjectRow.get("referrerName").toString(); + String agent = jsonObjectRow.get("browser").toString(); + boolean botFound = false; + Iterator it = robotsList.iterator(); + while (it.hasNext()) { + // Create a Pattern object + Pattern r = Pattern.compile(it.next().toString()); + // Now create matcher object. + Matcher m = r.matcher(agent); + if (m.find()) { + botFound = true; + break; + } + } + if (botFound == false) { + JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails")); + for (Object actionDetail : actionDetails) { + JSONObject actionDetailsObj = (JSONObject) actionDetail; + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + Timestamp timestamp = new Timestamp( + Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000); + + String action = actionDetailsObj.get("type").toString(); + String url = actionDetailsObj.get("url").toString(); + + String entityID = processPortalURL(url); + String sourceItemType = ""; + + if (entityID.indexOf("|") > 0) { + sourceItemType = entityID.substring(0, entityID.indexOf("|")); + entityID = entityID.substring(entityID.indexOf("|") + 1); + } + + prepStatem.setInt(1, idSite); + prepStatem.setString(2, idVisit); + prepStatem.setString(3, country); + prepStatem.setString(4, action); + prepStatem.setString(5, url); + prepStatem.setString(6, entityID); + prepStatem.setString(7, sourceItemType); + prepStatem.setString(8, simpleDateFormat.format(timestamp)); + prepStatem.setString(9, referrerName); + prepStatem.setString(10, agent); + + prepStatem.addBatch(); + batch_size++; + if (batch_size == 10000) { + prepStatem.executeBatch(); + ConnectDB.getConnection().commit(); + batch_size = 0; + } + } + } + } + } + prepStatem.executeBatch(); + ConnectDB.getConnection().commit(); + + stmt.close(); + ConnectDB.getConnection().close(); + } + + public void portalStats() throws SQLException { + Connection con = ConnectDB.getConnection(); + Statement stmt = con.createStatement(); + con.setAutoCommit(false); + + String sql = "INSERT INTO piwiklogtmp SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'oaItem\', timestamp, 
referrer_name, agent FROM process_portal_log_tmp, public.result_oids roid WHERE entity_id IS NOT null AND entity_id=roid.orid AND roid.orid IS NOT null;"; + stmt.executeUpdate(sql); + stmt.close(); +// con.commit(); + + stmt = con.createStatement(); + sql = "INSERT INTO piwiklogtmp SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'datasource\', timestamp, referrer_name, agent FROM process_portal_log_tmp, public.datasource_oids roid WHERE entity_id IS NOT null AND entity_id=roid.orid AND roid.orid IS NOT null;"; + stmt.executeUpdate(sql); + stmt.close(); +// con.commit(); + + stmt = con.createStatement(); + sql = "INSERT INTO piwiklogtmp SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'organization\', timestamp, referrer_name, agent FROM process_portal_log_tmp, public.organization_oids roid WHERE entity_id IS NOT null AND entity_id=roid.orid AND roid.orid IS NOT null;"; + stmt.executeUpdate(sql); + stmt.close(); +// con.commit(); + + stmt = con.createStatement(); + sql = "INSERT INTO piwiklogtmp SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'project\', timestamp, referrer_name, agent FROM process_portal_log_tmp, public.project_oids roid WHERE entity_id IS NOT null AND entity_id=roid.orid AND roid.orid IS NOT null;"; + stmt.executeUpdate(sql); + stmt.close(); +// con.commit(); + + con.close(); + } + + private void cleanOAI() throws Exception { + ConnectDB.getConnection().setAutoCommit(false); + + stmt = ConnectDB.getConnection().createStatement(); + String sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/','oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/','oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/','oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/','oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/','oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/','oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id 
= regexp_replace(entity_id, '^oai:repositorio.uac.pt/','oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/','oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/','oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/','oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/','oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/','oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/','oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/','oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/','oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/','oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/','oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/','oai:repositorio.ipsantarem.pt:') WHERE 
entity_id LIKE 'oai:repositorio.ipsantarem.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/','oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/','oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/','oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/','oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/','oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/','oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/','oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/','oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/','oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/','oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); + ConnectDB.getConnection().commit(); + + stmt = ConnectDB.getConnection().createStatement(); + sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/','oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%';"; + stmt.executeUpdate(sql); + stmt.close(); 
+ ConnectDB.getConnection().commit(); + + ConnectDB.getConnection().close(); + } + + private String processPortalURL(String url) { + + if (url.indexOf("explore.openaire.eu") > 0) { + try { + url = URLDecoder.decode(url, "UTF-8"); + } catch (Exception e) { + log.info(url); + } + if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) { + url = "datasource|" + + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59); + } else if (url.indexOf("datasource=") > 0 + && url.substring(url.indexOf("datasource=") + 11).length() >= 46) { + url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57); + } else if (url.indexOf("datasourceFilter=") > 0 + && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) { + url = "datasource|" + + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63); + } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) { + url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56); + } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) { + url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56); + } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 + && !url.contains("oai:dnet:corda")) { + url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56); + } else if (url.indexOf("organizationId=") > 0 + && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) { + url = "organization|" + + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61); + } else { + url = ""; + } + } else { + url = ""; + } + + return url; + } + + private void updateProdTables() throws SQLException { + Statement stmt = ConnectDB.getConnection().createStatement(); + ConnectDB.getConnection().setAutoCommit(false); + + String sql = "insert into piwiklog select * from piwiklogtmp;"; + stmt.executeUpdate(sql); + + sql = "insert into views_stats select * from views_stats_tmp;"; + stmt.executeUpdate(sql); + + sql = "insert into downloads_stats select * from downloads_stats_tmp;"; + stmt.executeUpdate(sql); + + sql = "insert into pageviews_stats select * from pageviews_stats_tmp;"; + stmt.executeUpdate(sql); + + sql = "DROP TABLE IF EXISTS views_stats_tmp;"; + stmt.executeUpdate(sql); + + sql = "DROP TABLE IF EXISTS downloads_stats_tmp;"; + stmt.executeUpdate(sql); + + sql = "DROP TABLE IF EXISTS pageviews_stats_tmp;"; + stmt.executeUpdate(sql); + + sql = "DROP TABLE IF EXISTS process_portal_log_tmp;"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getConnection().commit(); + ConnectDB.getConnection().close(); + + log.info("updateProdTables done"); + } + + private ArrayList listHdfsDir(String dir) throws Exception { + + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + fileNames.add(fileName); + } + + hdfs.close(); + } catch (Exception e) { + log.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath)); + throw new 
Exception("HDFS file path with exported data does not exist : " + logPath, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + if (!line.equals("[]")) { + sb.append(line); + } + // sb.append(line); + line = br.readLine(); + } + result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + if (result.equals("")) { + result = "[]"; + } + + // fs.close(); + } catch (Exception e) { + log.error(e); + throw new Exception(e); + } + + return result; + } + + private Connection getConnection() throws SQLException { + return ConnectDB.getConnection(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ReadCounterRobotsList.java new file mode 100644 index 000000000..e840e6e6c --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/ReadCounterRobotsList.java @@ -0,0 +1,56 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.usagestats.export; + +/** + * + * @author dpie + */ + +/** + * @author dpie + */ +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.ArrayList; + +import org.json.JSONException; +import org.json.simple.JSONArray; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class ReadCounterRobotsList { + + private ArrayList robotsPatterns = new ArrayList(); + private String COUNTER_ROBOTS_URL; + + public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException { + COUNTER_ROBOTS_URL = url; + robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL); + } + + private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException { + InputStream is = new URL(url).openStream(); + JSONParser parser = new JSONParser(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1"))); + JSONArray jsonArray = (JSONArray) parser.parse(reader); + for (Object aJsonArray : jsonArray) { + org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray; + robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\")); + } + return robotsPatterns; + } + + public ArrayList getRobotsPatterns() { + return robotsPatterns; + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/SarcStats.java new file mode 100644 index 000000000..83fbc0205 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/SarcStats.java @@ -0,0 +1,255 @@ + +package eu.dnetlib.usagestats.export; + 
+import java.io.*; +// import java.io.BufferedReader; +// import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.Calendar; + +import org.apache.log4j.Logger; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; + +/** + * Created by dpie + */ +public class SarcStats { + + private Statement stmt = null; + + private final Logger log = Logger.getLogger(this.getClass()); + + public SarcStats() throws Exception { + createTables(); + } + + private void createTables() throws Exception { + try { + + stmt = ConnectDB.getConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmt.executeUpdate(sqlCreateTableSushiLog); + + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmt.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmt.executeUpdate(createSushiIndex); + + stmt.close(); + ConnectDB.getConnection().close(); + log.info("Sushi Tables Created"); + } catch (Exception e) { + log.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void processSarc() throws Exception { + processARReport("https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X"); + processARReport("https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X"); + processARReport("https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335"); + processARReport("https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030"); + processARReport("https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781"); + processARReport("https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529"); + processARReport("https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027"); + processARReport("https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474"); + processARReport("https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099"); + processARReport("https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187"); + processARReport("https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X"); + processARReport("https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799"); + processARReport("https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098"); + processARReport("https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754"); + processARReport("https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794"); + processARReport("https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826"); + 
processARReport("https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015"); + } + + public void sarcStats() throws Exception { + stmt = ConnectDB.getConnection().createStatement(); + ConnectDB.getConnection().setAutoCommit(false); + + // String sql = "SELECT s.source, d.id AS repository_id, ro.id as result_id, extract('year' from s.date::date) + // ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') as date, s.count, '0' INTO + // downloads_stats FROM sushilog s, datasource_oids d, result_oids ro WHERE s.repository=d.orid AND + // s.rid=ro.orid AND metric_type='ft_total'"; + String sql = "INSERT INTO downloads_stats SELECT s.source, d.id AS repository_id, ro.id as result_id, extract('year' from s.date::date) ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') as date, s.count, '0' FROM sushilog s, public.datasource_oids d, public.datasource_results dr, public.result_pids ro WHERE d.orid LIKE '%' || s.repository || '%' AND dr.id=d.id AND dr.result=ro.id AND s.rid=ro.pid AND ro.type='doi' AND metric_type='ft_total' AND s.source='SARC-OJS';"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getConnection().commit(); + ConnectDB.getConnection().close(); + } + + public void processARReport(String url, String issn) throws Exception { + log.info("Processing SARC! issn: " + issn + " with url: " + url); + ConnectDB.getConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + + Calendar start = Calendar.getInstance(); + start.set(Calendar.YEAR, 2016); + start.set(Calendar.MONTH, Calendar.JANUARY); + // start.setTime(simpleDateFormat.parse("2016-01")); + + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getConnection() + .prepareStatement("SELECT max(date) FROM sushilog WHERE repository=?;"); + st.setString(1, issn); + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + } + } + rs_date.close(); + + PreparedStatement preparedStatement = ConnectDB + .getConnection() + .prepareStatement( + "INSERT INTO sushilog (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)"); + int batch_size = 0; + + while (start.before(end)) { + // String reportUrl = + // "http://irus.mimas.ac.uk/api/sushilite/v1_7/GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + // + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) + + // "&RepositoryIdentifier=opendoar%3A" + opendoar + + // "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); + // System.out.println(reportUrl); + start.add(Calendar.MONTH, 1); + + String text = getJson(reportUrl); + if (text == null) { + continue; + } + + /* + * PrintWriter wr = new PrintWriter(new FileWriter("logs/" + simpleDateFormat.format(start.getTime()) + + * ".json")); wr.print(text); wr.close(); + */ + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); + jsonObject = (JSONObject) 
jsonObject.get("sc:Report"); + if (jsonObject == null) { + continue; + } + jsonObject = (JSONObject) jsonObject.get("c:Report"); + jsonObject = (JSONObject) jsonObject.get("c:Customer"); + Object obj = jsonObject.get("c:ReportItems"); + JSONArray jsonArray = new JSONArray(); + if (obj instanceof JSONObject) { + jsonArray.add(obj); + } else { + jsonArray = (JSONArray) obj; + // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); + } + if (jsonArray == null) { + continue; + } + + String rid = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = new JSONArray(); + obj = jsonObjectRow.get("c:ItemIdentifier"); + if (obj instanceof JSONObject) { + itemIdentifier.add(obj); + } else { + // JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("c:ItemIdentifier"); + itemIdentifier = (JSONArray) obj; + } + for (Object identifier : itemIdentifier) { + JSONObject doi = (JSONObject) identifier; + if (doi.get("c:Type").toString().equals("DOI")) { + rid = doi.get("c:Value").toString(); + // System.out.println("DOI: " + rid); + break; + } + } + if (rid.isEmpty()) { + continue; + } + + JSONObject itemPerformance = (JSONObject) jsonObjectRow.get("c:ItemPerformance"); + // for (Object perf : itemPerformance) { + JSONObject performance = (JSONObject) itemPerformance; + JSONObject periodObj = (JSONObject) performance.get("c:Period"); + String period = periodObj.get("c:Begin").toString(); + JSONObject instanceObj = (JSONObject) performance.get("c:Instance"); + String type = instanceObj.get("c:MetricType").toString(); + String count = instanceObj.get("c:Count").toString(); + // System.out.println(rid + " : " + period + " : " + count); + + preparedStatement.setString(1, "SARC-OJS"); + preparedStatement.setString(2, issn); + // preparedStatement.setString(2, url); + preparedStatement.setString(3, rid); + preparedStatement.setString(4, period); + preparedStatement.setString(5, type); + preparedStatement.setInt(6, Integer.parseInt(count)); + preparedStatement.addBatch(); + batch_size++; + if (batch_size == 10000) { + preparedStatement.executeBatch(); + ConnectDB.getConnection().commit(); + batch_size = 0; + } + // } + + // break; + } + // break; + } + + preparedStatement.executeBatch(); + ConnectDB.getConnection().commit(); + ConnectDB.getConnection().close(); + } + + private String getJson(String url) { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + log.error("Failed to get URL: " + e); + // System.out.println("Failed to get URL: " + e); + return null; + // throw new Exception("Failed to get URL: " + e.toString(), e); + } + } +} diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/UsageStatsExporter.java new file mode 100644 index 000000000..436b87d87 --- /dev/null +++ 
b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/UsageStatsExporter.java
@@ -0,0 +1,57 @@
+
+package eu.dnetlib.usagestats.export;
+
+import java.io.InputStream;
+import java.util.Properties;
+
+import org.apache.log4j.Logger;
+
+public class UsageStatsExporter {
+
+	private Logger log = Logger.getLogger(this.getClass());
+	private Properties properties;
+
+	public UsageStatsExporter(Properties properties) {
+		this.properties = properties;
+	}
+
+	// public void export() throws Exception {
+	public void export() throws Exception {
+
+		// read workflow parameters
+		String matomoAuthToken = properties.getProperty("matomo_AuthToken");
+		String matomoBaseURL = properties.getProperty("matomo_BaseUrl");
+		String repoLogPath = properties.getProperty("repo_LogPath");
+		String portalLogPath = properties.getProperty("portal_LogPath");
+		String portalMatomoID = properties.getProperty("portal_MatomoID");
+		String irusUKBaseURL = properties.getProperty("IRUS_UK_BaseUrl");
+
+		// connect to DB
+		ConnectDB.init(properties);
+
+		PiwikDownloadLogs piwd = new PiwikDownloadLogs(matomoBaseURL, matomoAuthToken);
+		piwd.GetOpenAIRELogs(repoLogPath, portalLogPath, portalMatomoID);
+
+		/*
+		 * Create DB tables, insert/update statistics
+		 */
+		PiwikStatsDB piwikstatsdb = new PiwikStatsDB(repoLogPath, portalLogPath);
+		piwikstatsdb.setCounterRobotsURL(properties.getProperty("COUNTER_robots_Url"));
+		piwikstatsdb.processLogs();
+		log.info("process logs done");
+
+		IrusStats irusstats = new IrusStats(irusUKBaseURL);
+		irusstats.processIrusRRReport();
+		irusstats.irusStats();
+		log.info("irus done");
+
+		SarcStats sarcStats = new SarcStats();
+		sarcStats.processSarc();
+		sarcStats.sarcStats();
+		log.info("sarc done");
+
+		// finalize usagestats
+		piwikstatsdb.finalizeStats();
+		log.info("finalized stats");
+	}
+}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/index.html b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/index.html
new file mode 100644
index 000000000..9eac2342f
--- /dev/null
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usage-stats/export/index.html
@@ -0,0 +1,43 @@
+
+
+  Revision 58415: /dnet45/modules/dnet-openaire-usage-stats-export-wf/trunk/dnet-openaire-usage-stats-export/src/main/java/eu/dnetlib/usagestats/export
+
+

+ + Powered by + Subversion + version 1.4.4 (r25188). + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usage-stats/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usage-stats/oozie_app/config-default.xml new file mode 100644 index 000000000..ba7002cff --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usage-stats/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usage-stats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usage-stats/oozie_app/workflow.xml new file mode 100644 index 000000000..70d4dcffc --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usage-stats/oozie_app/workflow.xml @@ -0,0 +1,76 @@ + + + + stats_db_name + the target stats database name + + + openaire_db_name + the original graph database name + + + external_stats_db_name + stats_ext + the external stats that should be added since they are not included in the graph database + + + hiveMetastoreUris + hive server metastore URIs + + + hiveJdbcUrl + hive server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${jobTracker} + ${nameNode} + + + mapred.job.queue.name + ${queueName} + + + eu.dnetlib.oa.graph.usage-stats.export.UsageStatsExporter + org.apache.oozie.test.MyTest + ${outputFileName} + + + + + + + + + + + ${hiveJdbcUrl} + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + + + + + + + diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index ea3433903..ca82cf1fa 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -26,6 +26,7 @@ dhp-dedup-scholexplorer dhp-graph-provision-scholexplorer dhp-stats-update + dhp-usage-stats-update dhp-broker-events
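
Note on wiring: the Oozie workflow above names eu.dnetlib.oa.graph.usage-stats.export.UsageStatsExporter as the main class of its java action, while the UsageStatsExporter class added in this patch lives in package eu.dnetlib.usagestats.export and exposes no main(String[]); presumably ExecuteWorkflow (added by this patch but not reproduced in this excerpt) supplies the actual entry point. Purely as an illustration of how the exporter could be driven, the sketch below loads the properties read by export() and ConnectDB.init() from a file given as the first argument; the driver class name and the property-file argument are assumptions, not part of the patch.

    package eu.dnetlib.usagestats.export;

    import java.io.FileInputStream;
    import java.util.Properties;

    // Hypothetical driver (illustration only): loads the workflow properties used by
    // UsageStatsExporter.export() and ConnectDB.init(), then runs the full export.
    public class UsageStatsExporterDriver {

        public static void main(String[] args) throws Exception {
            Properties properties = new Properties();
            try (FileInputStream in = new FileInputStream(args[0])) {
                properties.load(in);
            }
            new UsageStatsExporter(properties).export();
        }
    }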