From 65acece7c4eec6048fb367407ec7734bbf21632c Mon Sep 17 00:00:00 2001 From: Spyros Zoupanos Date: Mon, 21 Sep 2020 22:27:15 +0300 Subject: [PATCH] lareferencia removeDoubleClicks done --- .../usagestats/export/LaReferenciaStats.java | 119 +++++------------- .../graph/usagestats/export/PiwikStatsDB.java | 4 +- 2 files changed, 32 insertions(+), 91 deletions(-) diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java index 59f3c29963..d9d7ac22f9 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaStats.java @@ -104,12 +104,15 @@ public class LaReferenciaStats { public void processLogs() throws Exception { try { System.out.println("====> Processing LaReferencia repository logs"); - processlaReferenciaLog(); +// processlaReferenciaLog(); System.out.println("====> LaReferencia repository logs process done"); log.info("LaReferencia repository process done"); + + System.out.println("====> LaReferencia removing double clicks"); // removeDoubleClicks(); -// log.info("LaReferencia removing double clicks done"); -// viewsStats(); + System.out.println("====> LaReferencia removed double clicks"); + log.info("LaReferencia removing double clicks done"); + viewsStats(); // log.info("LaReferencia views done"); // downloadsStats(); // log.info("LaReferencia downloads done"); @@ -202,105 +205,43 @@ public class LaReferenciaStats { stmt.close(); } - public void processlaReferenciaLogOld() throws Exception { - - Statement stmt = ConnectDB.getConnection().createStatement(); - ConnectDB.getConnection().setAutoCommit(false); - ArrayList jsonFiles = listHdfsDir(this.logRepoPath); - - // File dir = new File(this.logRepoPath); - // File[] jsonFiles = dir.listFiles(); - PreparedStatement prepStatem = ConnectDB - .getConnection() - .prepareStatement( - "INSERT INTO lareferencialogtmp (matomoid, source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?,?)"); - int batch_size = 0; - - JSONParser parser = new JSONParser(); - for (String jsonFile : jsonFiles) { - System.out.println(jsonFile); - JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile)); - for (Object aJsonArray : jsonArray) { - JSONObject jsonObjectRow = (JSONObject) aJsonArray; - int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString()); - String idVisit = jsonObjectRow.get("idVisit").toString(); - String country = jsonObjectRow.get("country").toString(); - String referrerName = jsonObjectRow.get("referrerName").toString(); - String agent = jsonObjectRow.get("browser").toString(); - String sourceItemType = "repItem"; - - JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails")); - for (Object actionDetail : actionDetails) { - JSONObject actionDetailsObj = (JSONObject) actionDetail; - - if (actionDetailsObj.get("customVariables") != null) { - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); - Timestamp timestamp = new Timestamp( - Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000); - String url = actionDetailsObj.get("url").toString(); - String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("1")) - .get("customVariablePageValue1") - .toString(); - String opendoar = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("2")) - .get("customVariablePageValue2") - .toString(); - String action = actionDetailsObj.get("type").toString(); - prepStatem.setInt(1, idSite); - prepStatem.setString(2, "opendoar____::" + opendoar); - prepStatem.setString(3, idVisit); - prepStatem.setString(4, country); - prepStatem.setString(5, action); - prepStatem.setString(6, url); - prepStatem.setString(7, oaipmh); - prepStatem.setString(8, sourceItemType); - prepStatem.setString(9, simpleDateFormat.format(timestamp)); - prepStatem.setString(10, referrerName); - prepStatem.setString(11, agent); - // prepStatem.setString(11, ); - prepStatem.addBatch(); - batch_size++; - if (batch_size == 10000) { - prepStatem.executeBatch(); - ConnectDB.getConnection().commit(); - batch_size = 0; - } - } - } - } - } - try { - prepStatem.executeBatch(); - ConnectDB.getConnection().commit(); - stmt.close(); - } catch (Exception e) { - - if (e instanceof java.sql.SQLException) { - java.sql.SQLException ne = ((java.sql.SQLException) e).getNextException(); - System.out.println(ne.getMessage()); - } - } - - } - public void removeDoubleClicks() throws Exception { Statement stmt = ConnectDB.getConnection().createStatement(); ConnectDB.getConnection().setAutoCommit(false); + System.out.println("====> Cleaning download double clicks"); // clean download double clicks - String sql = "DELETE FROM lareferencialogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp FROM lareferencialogtmp p1, lareferencialogtmp p2 WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp AND p1.timestamp Cleaned download double clicks"); stmt = ConnectDB.getConnection().createStatement(); - + System.out.println("====> Cleaning action double clicks"); // clean view double clicks - sql = "DELETE FROM lareferencialogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp from lareferencialogtmp p1, lareferencialogtmp p2 WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp AND p1.timestamp Cleaned action double clicks"); // conn.close(); } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java index c83102e615..b9ef6c168e 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java @@ -319,7 +319,7 @@ public class PiwikStatsDB { System.out.println("====> Cleaned download double clicks"); // clean view double clicks - System.out.println("====> Cleaning view double clicks"); + System.out.println("====> Cleaning action double clicks"); sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + "WHERE EXISTS (\n" + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + @@ -332,7 +332,7 @@ public class PiwikStatsDB { "AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" + "AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)"; stmt.executeUpdate(sql); - System.out.println("====> Cleaned view double clicks"); + System.out.println("====> Cleaned action double clicks"); stmt.close(); }