diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
index 2d3f7b08b8..f266108968 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
@@ -128,7 +128,7 @@ public class PiwikStatsDB {
             + ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
             + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
             + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
-            + "stored as orc tblproperties('transactional'='true');";
+            + "stored as orc tblproperties('transactional'='true')";
         stmt.executeUpdate(sqlCreateTmpTablePiwikLog);
 
         //////////////////////////////////////////////////
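The only change in this first hunk is dropping the trailing semicolon from the DDL string: a statement sent through the HiveServer2 JDBC driver is executed one at a time, and a terminating ';' typically fails to parse. A minimal sketch of the same pattern, assuming a placeholder JDBC URL, database, and table name rather than the project's ConnectDB setup:

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.Statement;

    public class HiveDdlSketch {
        public static void main(String[] args) throws Exception {
            // Placeholder HiveServer2 URL; host, port and database are assumptions.
            try (Connection conn = DriverManager
                .getConnection("jdbc:hive2://hive-host:10000/usagestats");
                Statement stmt = conn.createStatement()) {
                // No trailing ';' -- the Hive JDBC driver runs exactly one
                // statement and usually rejects a terminating semicolon.
                stmt.executeUpdate(
                    "CREATE TABLE IF NOT EXISTS piwiklog_sketch (source INT, entity_id STRING) "
                        + "CLUSTERED BY (source) INTO 10 BUCKETS "
                        + "STORED AS ORC TBLPROPERTIES('transactional'='true')");
            }
        }
    }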
".views_stats_tmp"; stmt.executeUpdate(drop_views_stats_tmp); System.out.println("====> Dropped views_stats_tmp table"); - + System.out.println("====> Creating views_stats_tmp table"); - String create_views_stats_tmp = - "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp " + + String create_views_stats_tmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".views_stats_tmp " + "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(views) AS count, max(openaire_referrer) AS openaire " + + "max(views) AS count, max(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".result_views_monthly_tmp p, " + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source!='5' AND p.source=d.piwik_id AND p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + + "WHERE p.source!='5' AND p.source=d.piwik_id AND p.id=ro.oid " + + "GROUP BY d.id, ro.id, month " + "ORDER BY d.id, ro.id, month"; stmt.executeUpdate(create_views_stats_tmp); System.out.println("====> Created views_stats_tmp table"); - - + System.out.println("====> Dropping views_stats table"); String drop_views_stats = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + @@ -385,43 +386,40 @@ public class PiwikStatsDB { System.out.println("====> Dropped views_stats table"); System.out.println("====> Creating views_stats table"); - String create_view_stats = - "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + String create_view_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats_tmp"; stmt.executeUpdate(create_view_stats); System.out.println("====> Created views_stats table"); - - + System.out.println("====> Dropping pageviews_stats_tmp table"); String drop_pageviews_stats_tmp = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(drop_pageviews_stats_tmp); System.out.println("====> Dropped pageviews_stats_tmp table"); - + System.out.println("====> Creating pageviews_stats_tmp table"); - String create_pageviews_stats_tmp = - "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp AS SELECT " + - "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + + String create_pageviews_stats_tmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pageviews_stats_tmp AS SELECT " + + "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".result_views_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source='23' AND p.source=d.piwik_id and p.id=ro.oid \n" + - "GROUP BY d.id, ro.id, month " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source='23' AND p.source=d.piwik_id and p.id=ro.oid \n" + + "GROUP BY d.id, ro.id, month " + "ORDER BY d.id, ro.id, month"; stmt.executeUpdate(create_pageviews_stats_tmp); System.out.println("====> Created pageviews_stats_tmp table"); - System.out.println("====> Droping pageviews_stats table"); String drop_pageviews_stats = "DROP TABLE IF EXISTS " + 
ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; stmt.executeUpdate(drop_pageviews_stats); System.out.println("====> Dropped pageviews_stats table"); - + System.out.println("====> Creating pageviews_stats table"); - String create_pageviews_stats = - "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + + String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pageviews_stats " + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats_tmp"; stmt.executeUpdate(create_pageviews_stats); System.out.println("====> Created pageviews_stats table"); @@ -440,53 +438,48 @@ public class PiwikStatsDB { ".result_views_monthly_tmp"; stmt.executeUpdate(drop_result_views_monthly_tmp); System.out.println("====> Dropped result_downloads_monthly_tmp view"); - + System.out.println("====> Creating result_views_monthly_tmp view"); - String sql = - "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".result_downloads_monthly_tmp " + - "AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " + - "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + - "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".result_downloads_monthly_tmp " + + "AS SELECT entity_id AS id, COUNT(entity_id) as downloads, " + + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp where action='download' " + - "AND (source_item_type='oaItem' OR source_item_type='repItem') " + - "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + + "AND (source_item_type='oaItem' OR source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + "ORDER BY source, entity_id, month"; stmt.executeUpdate(sql); System.out.println("====> Created result_views_monthly_tmp view"); - System.out.println("====> Dropping downloads_stats_tmp table"); String drop_views_stats = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; stmt.executeUpdate(drop_views_stats); System.out.println("====> Dropped downloads_stats_tmp table"); - + System.out.println("====> Creating downloads_stats_tmp view"); - sql = - "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp AS " + + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp AS " + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + - "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".result_downloads_monthly_tmp p, " + - ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + - "WHERE p.source=d.piwik_id and p.id=ro.oid " + - "GROUP BY d.id, ro.id, month " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id and p.id=ro.oid " + + "GROUP BY d.id, ro.id, month " + "ORDER BY d.id, ro.id, month"; System.out.println("====> Created downloads_stats_tmp view"); 
stmt.executeUpdate(sql); - - System.out.println("====> Dropping downloads_stats table"); String drop_pageviews_stats = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; stmt.executeUpdate(drop_pageviews_stats); System.out.println("====> Dropped downloads_stats table"); - + System.out.println("====> Creating downloads_stats table"); - String create_pageviews_stats = - "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + "STORED AS PARQUET AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats_tmp"; stmt.executeUpdate(create_pageviews_stats); System.out.println("====> Created downloads_stats table"); @@ -498,7 +491,7 @@ public class PiwikStatsDB { stmt.close(); ConnectDB.getConnection().close(); - + System.exit(0); } @@ -723,93 +716,93 @@ public class PiwikStatsDB { ConnectDB.getConnection().close(); } - // Import OPENAIRE Logs to DB public void processPortalLog() throws Exception { Statement stmt = ConnectDB.getConnection().createStatement(); ConnectDB.getConnection().setAutoCommit(false); - ArrayList jsonFiles = listHdfsDir(this.logPortalPath); -// File folder = new File(this.logPortalPath); -// File[] jsonFiles = folder.listFiles(); + System.out.println("====> Dropping process_portal_log_tmp_json table"); + String drop_process_portal_log_tmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json"; + stmt.executeUpdate(drop_process_portal_log_tmp_json); + System.out.println("====> Dropped process_portal_log_tmp_json table"); - PreparedStatement prepStatem = ConnectDB - .getConnection() - .prepareStatement( - "INSERT INTO process_portal_log_tmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)"); - int batch_size = 0; - JSONParser parser = new JSONParser(); - for (String jsonFile : jsonFiles) { - JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile)); + System.out.println("====> Creating process_portal_log_tmp_json"); + String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json(" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + UsageStatsExporter.repoLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_process_portal_log_tmp_json); + System.out.println("====> Created process_portal_log_tmp_json"); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString()); - String idVisit = jsonObjectRow.get("idVisit").toString(); - String country = jsonObjectRow.get("country").toString(); - String referrerName = jsonObjectRow.get("referrerName").toString(); - String agent = jsonObjectRow.get("browser").toString(); - boolean botFound = false; - Iterator it = robotsList.iterator(); - while (it.hasNext()) { - // Create a Pattern object - Pattern r = Pattern.compile(it.next().toString()); - // Now create 
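Both viewsStats() and downloadsStats() above follow the same two-phase Hive pattern: a CREATE OR REPLACE VIEW defines the per-entity, per-month aggregation over the raw log, and a CREATE TABLE ... STORED AS PARQUET AS SELECT then materializes the view in a columnar format for cheap downstream scans. A condensed sketch of the pattern, assuming illustrative schema and table names rather than the project's:

    import java.sql.Connection;
    import java.sql.Statement;

    public class MonthlyStatsSketch {
        // Aggregates raw events per entity and month, then materializes to Parquet.
        static void buildMonthlyStats(Connection conn, String schema) throws Exception {
            try (Statement stmt = conn.createStatement()) {
                // Phase 1: a logical view holding the GROUP BY; nothing runs yet.
                stmt.executeUpdate(
                    "CREATE OR REPLACE VIEW " + schema + ".events_monthly AS "
                        + "SELECT entity_id AS id, COUNT(entity_id) AS hits, "
                        + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
                        + "FROM " + schema + ".rawlog "
                        + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source");
                // Phase 2: CTAS into Parquet; this is where the query actually executes.
                stmt.executeUpdate(
                    "CREATE TABLE IF NOT EXISTS " + schema + ".events_stats "
                        + "STORED AS PARQUET AS SELECT * FROM " + schema + ".events_monthly");
            }
        }
    }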
@@ -723,93 +716,93 @@ public class PiwikStatsDB {
         ConnectDB.getConnection().close();
     }
 
-
     // Import OPENAIRE Logs to DB
     public void processPortalLog() throws Exception {
         Statement stmt = ConnectDB.getConnection().createStatement();
         ConnectDB.getConnection().setAutoCommit(false);
 
-        ArrayList jsonFiles = listHdfsDir(this.logPortalPath);
-//        File folder = new File(this.logPortalPath);
-//        File[] jsonFiles = folder.listFiles();
+        System.out.println("====> Dropping process_portal_log_tmp_json table");
+        String drop_process_portal_log_tmp_json = "DROP TABLE IF EXISTS " +
+            ConnectDB.getUsageStatsDBSchema() +
+            ".process_portal_log_tmp_json";
+        stmt.executeUpdate(drop_process_portal_log_tmp_json);
+        System.out.println("====> Dropped process_portal_log_tmp_json table");
 
-        PreparedStatement prepStatem = ConnectDB
-            .getConnection()
-            .prepareStatement(
-                "INSERT INTO process_portal_log_tmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
-        int batch_size = 0;
-        JSONParser parser = new JSONParser();
-        for (String jsonFile : jsonFiles) {
-            JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
+        System.out.println("====> Creating process_portal_log_tmp_json");
+        String create_process_portal_log_tmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
+            ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json(" +
+            "  `idSite` STRING,\n" +
+            "  `idVisit` STRING,\n" +
+            "  `country` STRING,\n" +
+            "  `referrerName` STRING,\n" +
+            "  `browser` STRING,\n" +
+            "  `actionDetails` ARRAY<\n" +
+            "    struct<\n" +
+            "      type: STRING,\n" +
+            "      url: STRING,\n" +
+            "      timestamp: String\n" +
+            "    >\n" +
+            "  >\n" +
+            ")\n" +
+            "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
+            "LOCATION '" + UsageStatsExporter.portalLogPath + "'\n" +
+            "TBLPROPERTIES (\"transactional\"=\"false\")";
+        stmt.executeUpdate(create_process_portal_log_tmp_json);
+        System.out.println("====> Created process_portal_log_tmp_json");
 
-            for (Object aJsonArray : jsonArray) {
-                JSONObject jsonObjectRow = (JSONObject) aJsonArray;
-                int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
-                String idVisit = jsonObjectRow.get("idVisit").toString();
-                String country = jsonObjectRow.get("country").toString();
-                String referrerName = jsonObjectRow.get("referrerName").toString();
-                String agent = jsonObjectRow.get("browser").toString();
-                boolean botFound = false;
-                Iterator it = robotsList.iterator();
-                while (it.hasNext()) {
-                    // Create a Pattern object
-                    Pattern r = Pattern.compile(it.next().toString());
-                    // Now create matcher object.
-                    Matcher m = r.matcher(agent);
-                    if (m.find()) {
-                        botFound = true;
-                        break;
-                    }
-                }
-                if (botFound == false) {
-                    JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
-                    for (Object actionDetail : actionDetails) {
-                        JSONObject actionDetailsObj = (JSONObject) actionDetail;
+        System.out.println("====> Dropping process_portal_log_tmp table");
+        String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " +
+            ConnectDB.getUsageStatsDBSchema() +
+            ".process_portal_log_tmp";
+        stmt.executeUpdate(drop_process_portal_log_tmp);
+        System.out.println("====> Dropped process_portal_log_tmp");
 
-                        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-                        simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
-                        Timestamp timestamp = new Timestamp(
-                            Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
+        System.out.println("====> Creating process_portal_log_tmp");
+        String create_process_portal_log_tmp = "CREATE TABLE " +
+            ConnectDB.getUsageStatsDBSchema() +
+            ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " +
+            "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
+            "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
+        stmt.executeUpdate(create_process_portal_log_tmp);
+        System.out.println("====> Created process_portal_log_tmp");
 
-                        String action = actionDetailsObj.get("type").toString();
-                        String url = actionDetailsObj.get("url").toString();
-
-                        String entityID = processPortalURL(url);
-                        String sourceItemType = "";
-
-                        if (entityID.indexOf("|") > 0) {
-                            sourceItemType = entityID.substring(0, entityID.indexOf("|"));
-                            entityID = entityID.substring(entityID.indexOf("|") + 1);
-                        }
-
-                        prepStatem.setInt(1, idSite);
-                        prepStatem.setString(2, idVisit);
-                        prepStatem.setString(3, country);
-                        prepStatem.setString(4, action);
-                        prepStatem.setString(5, url);
-                        prepStatem.setString(6, entityID);
-                        prepStatem.setString(7, sourceItemType);
-                        prepStatem.setString(8, simpleDateFormat.format(timestamp));
-                        prepStatem.setString(9, referrerName);
-                        prepStatem.setString(10, agent);
-
-                        prepStatem.addBatch();
-                        batch_size++;
-                        if (batch_size == 10000) {
-                            prepStatem.executeBatch();
-                            ConnectDB.getConnection().commit();
-                            batch_size = 0;
-                        }
-                    }
-                }
-            }
-        }
-        prepStatem.executeBatch();
-        ConnectDB.getConnection().commit();
+        System.out.println("====> Inserting into process_portal_log_tmp");
+        String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+            + ".process_portal_log_tmp " +
+            "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
+            + "actiondetail.url as url, " +
+            "CASE\n" +
+            "  WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " +
+            "  WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " +
+            "  WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
+            + "  WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " +
+            "  WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " +
+            "  WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " +
+            "  WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " +
+            "  ELSE '' " +
+            "END AS entity_id, " +
+            "CASE " +
+            "  WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " +
+            "  WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " +
+            "  WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " +
+            "  WHEN (actiondetail.url like '%articleId=%') THEN 'result' " +
+            "  WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " +
+            "  WHEN (actiondetail.url like '%projectId=%') THEN 'project' " +
+            "  WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " +
+            "  ELSE '' " +
+            "END AS source_item_type, " +
+            "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " +
+            "browser as agent " +
+            "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " +
+            "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+        stmt.executeUpdate(insert_process_portal_log_tmp);
+        System.out.println("====> Inserted into process_portal_log_tmp");
 
         stmt.close();
-        ConnectDB.getConnection().close();
     }
 
+
     public void portalStats() throws SQLException {
         Connection con = ConnectDB.getConnection();
         Statement stmt = con.createStatement();
'^oai:repositorio.ismai.pt/'," + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 6"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 7"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/'," + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 8"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 9"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 10"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 11"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 12"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 13"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + 
ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 14"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 15"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; stmt.executeUpdate(sql); @@ -981,53 +959,47 @@ public class PiwikStatsDB { System.out.println("====> Cleaning oai - Step 16"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 17"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 18"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 19"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 20"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 21"); stmt = ConnectDB.getConnection().createStatement(); - 
sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; stmt.executeUpdate(sql); @@ -1035,17 +1007,15 @@ public class PiwikStatsDB { System.out.println("====> Cleaning oai - Step 22"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 23"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; stmt.executeUpdate(sql); @@ -1053,17 +1023,15 @@ public class PiwikStatsDB { System.out.println("====> Cleaning oai - Step 24"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 25"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; stmt.executeUpdate(sql); @@ -1071,17 +1039,15 @@ public class PiwikStatsDB { System.out.println("====> Cleaning oai - Step 26"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 27"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; stmt.executeUpdate(sql); @@ -1089,23 +1055,20 @@ public class PiwikStatsDB { System.out.println("====> Cleaning oai - Step 28"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," + 
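Steps 1 through 29 above apply the same normalization to 29 repositories: regexp_replace rewrites the non-standard 'oai:host/' prefix into the canonical 'oai:host:' form. If the list keeps growing, the repeated blocks could be driven from an array instead; a sketch under that assumption (cleanOaiIds and the hosts list are hypothetical, not part of this patch):

    import java.sql.Connection;
    import java.sql.Statement;

    public class OaiCleanupSketch {
        // Hypothetical consolidation of the 29 per-repository UPDATE statements.
        // Hive UPDATE requires the target to be a transactional (ACID) table,
        // which piwiklogtmp is, per the DDL earlier in this patch.
        static void cleanOaiIds(Connection conn, String schema) throws Exception {
            String[] hosts = {
                "repositorio.chlc.min-saude.pt", "repositorio.hospitaldebraga.pt",
                "repositorio.ipl.pt", /* ... remaining repositories ... */ "comum.rcaap.pt"
            };
            for (String host : hosts) {
                try (Statement stmt = conn.createStatement()) {
                    // 'oai:host/...' -> 'oai:host:...': the slash after the
                    // authority is replaced by the colon the OAI scheme expects.
                    stmt.executeUpdate(
                        "UPDATE " + schema + ".piwiklogtmp "
                            + "SET entity_id = regexp_replace(entity_id, '^oai:" + host + "/', 'oai:" + host + ":') "
                            + "WHERE entity_id LIKE 'oai:" + host + "/%'");
                }
            }
        }
    }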
"'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - + System.out.println("====> Cleaning oai - Step 29"); stmt = ConnectDB.getConnection().createStatement(); - sql = - "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; stmt.executeUpdate(sql); stmt.close(); - - + System.out.println("====> Cleaning oai - Done, closing connection"); ConnectDB.getConnection().close(); } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java index f56885c078..2d66093c08 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java @@ -12,13 +12,13 @@ public class UsageStatsExporter { static String matomoAuthToken = "703bd17d845acdaf795e01bb1e0895b9"; static String matomoBaseURL = "analytics.openaire.eu"; - static String repoLogPath = "/user/spyros/logs/usage_stats_logs3/Repologs"; - static String portalLogPath = "/user/spyros/logs/usage_stats_logs3/Portallogs/"; + static String repoLogPath = "/user/spyros/logs/usage_stats_logs4/Repologs"; + static String portalLogPath = "/user/spyros/logs/usage_stats_logs4/Portallogs/"; static String portalMatomoID = "109"; static String irusUKBaseURL = "https://irus.jisc.ac.uk/api/sushilite/v1_7/"; - static String irusUKReportPath = "/user/spyros/logs/usage_stats_logs3/irusUKReports"; - static String sarcsReportPath = "/user/spyros/logs/usage_stats_logs3/sarcReports"; + static String irusUKReportPath = "/user/spyros/logs/usage_stats_logs4/irusUKReports"; + static String sarcsReportPath = "/user/spyros/logs/usage_stats_logs4/sarcReports"; public UsageStatsExporter(Properties properties) { this.properties = properties; @@ -39,7 +39,9 @@ public class UsageStatsExporter { // // the moment System.out.println("====> Initializing the download logs module"); PiwikDownloadLogs piwd = new PiwikDownloadLogs(matomoBaseURL, matomoAuthToken); + System.out.println("====> Downloading logs"); // piwd.GetOpenAIRELogs(repoLogPath, portalLogPath, portalMatomoID); + System.out.println("====> Downloaded logs"); // Create DB tables, insert/update statistics // String cRobotsUrl = properties.getProperty("COUNTER_robots_Url");