More progress. Cleaning view double clicks

This commit is contained in:
Spyros Zoupanos 2020-09-07 21:57:45 +03:00
parent 81102dd791
commit 398f1f6f15
1 changed files with 24 additions and 92 deletions

View File

@ -216,7 +216,6 @@ public class PiwikStatsDB {
Statement stmt = ConnectDB.getConnection().createStatement();
ConnectDB.getConnection().setAutoCommit(false);
System.out.println("====> Droping piwiklogtmp_json table");
String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
@ -224,7 +223,6 @@ public class PiwikStatsDB {
stmt.executeUpdate(drop_piwiklogtmp_json);
System.out.println("====> Dropped piwiklogtmp_json table");
System.out.println("====> Creating piwiklogtmp_json");
String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
@ -253,14 +251,12 @@ public class PiwikStatsDB {
stmt.executeUpdate(create_piwiklogtmp_json);
System.out.println("====> Created piwiklogtmp_json");
System.out.println("====> Droping piwiklogtmp table");
String drop_piwiklogtmp = "DROP TABLE IF EXISTS " +
ConnectDB.getUsageStatsDBSchema() +
".piwiklogtmp";
stmt.executeUpdate(drop_piwiklogtmp);
System.out.println("====> Created piwiklogtmp_json");
System.out.println("====> Creating piwiklogtmp");
String create_piwiklogtmp = "CREATE TABLE " +
@ -270,12 +266,10 @@ public class PiwikStatsDB {
"clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(create_piwiklogtmp);
System.out.println("====> Created piwiklogtmp");
System.out.println("====> Adding JSON Serde jar");
stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
System.out.println("====> Added JSON Serde jar");
System.out.println("====> Inserting into piwiklogtmp");
String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
@ -289,91 +283,14 @@ public class PiwikStatsDB {
stmt.executeUpdate(insert_piwiklogtmp);
System.out.println("====> Inserted into piwiklogtmp");
ConnectDB.getConnection().commit();
stmt.close();
// ArrayList<String> jsonFiles = listHdfsDir(this.logRepoPath);
//// File dir = new File(this.logRepoPath);
//// File[] jsonFiles = dir.listFiles();
//
//
// PreparedStatement prepStatem = ConnectDB
// .getConnection()
// .prepareStatement(
// "INSERT INTO piwiklogtmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
// int batch_size = 0;
// JSONParser parser = new JSONParser();
// for (String jsonFile : jsonFiles) {
// System.out.println(jsonFile);
// JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
// for (Object aJsonArray : jsonArray) {
// JSONObject jsonObjectRow = (JSONObject) aJsonArray;
// int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
// String idVisit = jsonObjectRow.get("idVisit").toString();
// String country = jsonObjectRow.get("country").toString();
// String referrerName = jsonObjectRow.get("referrerName").toString();
// String agent = jsonObjectRow.get("browser").toString();
// boolean botFound = false;
// Iterator it = robotsList.iterator();
// while (it.hasNext()) {
// // Create a Pattern object
// Pattern r = Pattern.compile(it.next().toString());
// // Now create matcher object.
// Matcher m = r.matcher(agent);
// if (m.find()) {
// // System.out.println("Found value: " + m.group(0));
// botFound = true;
// break;
// }
// }
// if (botFound == false) {
// String sourceItemType = "repItem";
//
// JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
// for (Object actionDetail : actionDetails) {
// JSONObject actionDetailsObj = (JSONObject) actionDetail;
//
// if (actionDetailsObj.get("customVariables") != null) {
// SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
// simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
// Timestamp timestamp = new Timestamp(
// Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
// String url = actionDetailsObj.get("url").toString();
// String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables"))
// .get("1")).get("customVariablePageValue1").toString();
// String action = actionDetailsObj.get("type").toString();
//
// prepStatem.setInt(1, idSite);
// prepStatem.setString(2, idVisit);
// prepStatem.setString(3, country);
// prepStatem.setString(4, action);
// prepStatem.setString(5, url);
// prepStatem.setString(6, oaipmh);
// prepStatem.setString(7, sourceItemType);
// prepStatem.setString(8, simpleDateFormat.format(timestamp));
// prepStatem.setString(9, referrerName);
// prepStatem.setString(10, agent);
// prepStatem.addBatch();
// batch_size++;
// if (batch_size == 10000) {
// prepStatem.executeBatch();
// ConnectDB.getConnection().commit();
// batch_size = 0;
// }
// }
// }
// }
// }
// }
// prepStatem.executeBatch();
// ConnectDB.getConnection().commit();
// stmt.close();
}
/**
 * Removes "double click" rows from the {@code piwiklogtmp} table in the schema
 * returned by {@code ConnectDB.getUsageStatsDBSchema()}.
 *
 * <p>Two {@code DELETE ... WHERE EXISTS} statements are issued through the shared
 * {@code ConnectDB} connection with auto-commit disabled: one announced as cleaning
 * download double clicks, one announced as cleaning view double clicks. The
 * connection is committed once at the end.
 *
 * @throws Exception if obtaining the connection or executing a statement fails
 */
public void removeDoubleClicks() throws Exception {
Statement stmt = ConnectDB.getConnection().createStatement();
// Auto-commit is switched off; the explicit commit() is the last statement of this method.
ConnectDB.getConnection().setAutoCommit(false);
System.out.println("====> Cleaning download double clicks");
// clean download double clicks
// NOTE(review): only the tail of this statement's WHERE clause is visible here (the
// correlation of the outer row with p1); the pairing conditions are not shown.
String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"WHERE EXISTS (\n" +
@ -387,17 +304,24 @@ public class PiwikStatsDB {
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
stmt.executeUpdate(sql);
stmt.close();
// NOTE(review): as written, this call terminates the JVM here, so nothing below it
// (view clean-up, commit) would run. Looks like a debugging stop - confirm whether it
// should still be present.
System.exit(0);
// The previous Statement was closed above, so a new one is created for the second DELETE.
stmt = ConnectDB.getConnection().createStatement();
System.out.println("====> Cleaned download double clicks");
// clean view double clicks
// NOTE(review): the value assigned on the next line is never executed - sql is reassigned
// below before the next executeUpdate(). It uses extract(EPOCH ...) and '::timestamp'
// casts and an unqualified table name, unlike the schema-qualified statement that replaces it.
sql = "DELETE FROM piwiklogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp from piwiklogtmp p1, piwiklogtmp p2 WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<10 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
System.out.println("====> Cleaning view double clicks");
// Deletes every row that matches a p1 row for which a p2 row exists with the same source,
// id_visit, entity_id and action (action='action', source other than '5'), a strictly later
// timestamp, and a unix_timestamp difference below 10 (presumably seconds - confirm).
// In other words the earlier row of each such pair is the one removed.
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
"WHERE EXISTS (\n" +
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " +
ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" +
"WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+
"AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp \n" +
"AND p1.timestamp<p2.timestamp AND (unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))<10 \n" +
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
stmt.executeUpdate(sql);
System.out.println("====> Cleaned view double clicks");
stmt.close();
// Single commit covering the work done since auto-commit was disabled at the top.
ConnectDB.getConnection().commit();
}
public void viewsStats() throws Exception {
@ -833,6 +757,14 @@ public class PiwikStatsDB {
String sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/','oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%';";
stmt.executeUpdate(sql);
stmt.close();
sql =
"UPDATE usagestats.piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/','oai:repositorio.chlc.min-saude.pt:') \n" +
"WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
System.exit(0);
ConnectDB.getConnection().commit();
stmt = ConnectDB.getConnection().createStatement();