forked from D-Net/dnet-hadoop
More progress. Cleaning view double clicks
This commit is contained in:
parent
81102dd791
commit
398f1f6f15
|
@ -216,7 +216,6 @@ public class PiwikStatsDB {
|
|||
Statement stmt = ConnectDB.getConnection().createStatement();
|
||||
ConnectDB.getConnection().setAutoCommit(false);
|
||||
|
||||
|
||||
System.out.println("====> Droping piwiklogtmp_json table");
|
||||
String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
|
@ -224,7 +223,6 @@ public class PiwikStatsDB {
|
|||
stmt.executeUpdate(drop_piwiklogtmp_json);
|
||||
System.out.println("====> Dropped piwiklogtmp_json table");
|
||||
|
||||
|
||||
System.out.println("====> Creating piwiklogtmp_json");
|
||||
String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
|
@ -253,14 +251,12 @@ public class PiwikStatsDB {
|
|||
stmt.executeUpdate(create_piwiklogtmp_json);
|
||||
System.out.println("====> Created piwiklogtmp_json");
|
||||
|
||||
|
||||
System.out.println("====> Droping piwiklogtmp table");
|
||||
String drop_piwiklogtmp = "DROP TABLE IF EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
".piwiklogtmp";
|
||||
stmt.executeUpdate(drop_piwiklogtmp);
|
||||
// Progress message for the DROP TABLE on piwiklogtmp executed just above.
System.out.println("====> Dropped piwiklogtmp table");
|
||||
|
||||
|
||||
System.out.println("====> Creating piwiklogtmp");
|
||||
String create_piwiklogtmp = "CREATE TABLE " +
|
||||
|
@ -270,12 +266,10 @@ public class PiwikStatsDB {
|
|||
"clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
|
||||
stmt.executeUpdate(create_piwiklogtmp);
|
||||
System.out.println("====> Created piwiklogtmp");
|
||||
|
||||
|
||||
|
||||
System.out.println("====> Adding JSON Serde jar");
|
||||
stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
|
||||
System.out.println("====> Added JSON Serde jar");
|
||||
|
||||
|
||||
System.out.println("====> Inserting into piwiklogtmp");
|
||||
String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
|
||||
|
@ -289,91 +283,14 @@ public class PiwikStatsDB {
|
|||
stmt.executeUpdate(insert_piwiklogtmp);
|
||||
System.out.println("====> Inserted into piwiklogtmp");
|
||||
|
||||
ConnectDB.getConnection().commit();
|
||||
stmt.close();
|
||||
|
||||
// ArrayList<String> jsonFiles = listHdfsDir(this.logRepoPath);
|
||||
//// File dir = new File(this.logRepoPath);
|
||||
//// File[] jsonFiles = dir.listFiles();
|
||||
//
|
||||
//
|
||||
// PreparedStatement prepStatem = ConnectDB
|
||||
// .getConnection()
|
||||
// .prepareStatement(
|
||||
// "INSERT INTO piwiklogtmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
|
||||
// int batch_size = 0;
|
||||
// JSONParser parser = new JSONParser();
|
||||
// for (String jsonFile : jsonFiles) {
|
||||
// System.out.println(jsonFile);
|
||||
// JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
|
||||
// for (Object aJsonArray : jsonArray) {
|
||||
// JSONObject jsonObjectRow = (JSONObject) aJsonArray;
|
||||
// int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
|
||||
// String idVisit = jsonObjectRow.get("idVisit").toString();
|
||||
// String country = jsonObjectRow.get("country").toString();
|
||||
// String referrerName = jsonObjectRow.get("referrerName").toString();
|
||||
// String agent = jsonObjectRow.get("browser").toString();
|
||||
// boolean botFound = false;
|
||||
// Iterator it = robotsList.iterator();
|
||||
// while (it.hasNext()) {
|
||||
// // Create a Pattern object
|
||||
// Pattern r = Pattern.compile(it.next().toString());
|
||||
// // Now create matcher object.
|
||||
// Matcher m = r.matcher(agent);
|
||||
// if (m.find()) {
|
||||
// // System.out.println("Found value: " + m.group(0));
|
||||
// botFound = true;
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// if (botFound == false) {
|
||||
// String sourceItemType = "repItem";
|
||||
//
|
||||
// JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
|
||||
// for (Object actionDetail : actionDetails) {
|
||||
// JSONObject actionDetailsObj = (JSONObject) actionDetail;
|
||||
//
|
||||
// if (actionDetailsObj.get("customVariables") != null) {
|
||||
// SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
// simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
// Timestamp timestamp = new Timestamp(
|
||||
// Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
|
||||
// String url = actionDetailsObj.get("url").toString();
|
||||
// String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables"))
|
||||
// .get("1")).get("customVariablePageValue1").toString();
|
||||
// String action = actionDetailsObj.get("type").toString();
|
||||
//
|
||||
// prepStatem.setInt(1, idSite);
|
||||
// prepStatem.setString(2, idVisit);
|
||||
// prepStatem.setString(3, country);
|
||||
// prepStatem.setString(4, action);
|
||||
// prepStatem.setString(5, url);
|
||||
// prepStatem.setString(6, oaipmh);
|
||||
// prepStatem.setString(7, sourceItemType);
|
||||
// prepStatem.setString(8, simpleDateFormat.format(timestamp));
|
||||
// prepStatem.setString(9, referrerName);
|
||||
// prepStatem.setString(10, agent);
|
||||
// prepStatem.addBatch();
|
||||
// batch_size++;
|
||||
// if (batch_size == 10000) {
|
||||
// prepStatem.executeBatch();
|
||||
// ConnectDB.getConnection().commit();
|
||||
// batch_size = 0;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// prepStatem.executeBatch();
|
||||
// ConnectDB.getConnection().commit();
|
||||
// stmt.close();
|
||||
}
|
||||
|
||||
public void removeDoubleClicks() throws Exception {
|
||||
Statement stmt = ConnectDB.getConnection().createStatement();
|
||||
ConnectDB.getConnection().setAutoCommit(false);
|
||||
|
||||
System.out.println("====> Cleaning download double clicks");
|
||||
// clean download double clicks
|
||||
String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
|
||||
"WHERE EXISTS (\n" +
|
||||
|
@ -387,17 +304,24 @@ public class PiwikStatsDB {
|
|||
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
|
||||
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
|
||||
stmt.executeUpdate(sql);
|
||||
stmt.close();
|
||||
|
||||
System.exit(0);
|
||||
|
||||
stmt = ConnectDB.getConnection().createStatement();
|
||||
System.out.println("====> Cleaned download double clicks");
|
||||
|
||||
// clean view double clicks
|
||||
sql = "DELETE FROM piwiklogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp from piwiklogtmp p1, piwiklogtmp p2 WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<10 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
|
||||
System.out.println("====> Cleaning view double clicks");
|
||||
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
|
||||
"WHERE EXISTS (\n" +
|
||||
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
|
||||
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " +
|
||||
ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" +
|
||||
"WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
|
||||
+
|
||||
"AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp \n" +
|
||||
"AND p1.timestamp<p2.timestamp AND (unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))<10 \n" +
|
||||
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
|
||||
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
|
||||
stmt.executeUpdate(sql);
|
||||
System.out.println("====> Cleaned view double clicks");
|
||||
stmt.close();
|
||||
ConnectDB.getConnection().commit();
|
||||
}
|
||||
|
||||
public void viewsStats() throws Exception {
|
||||
|
@ -833,6 +757,14 @@ public class PiwikStatsDB {
|
|||
String sql = "UPDATE piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/','oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%';";
|
||||
stmt.executeUpdate(sql);
|
||||
stmt.close();
|
||||
|
||||
|
||||
sql =
|
||||
"UPDATE usagestats.piwiklogtmp SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/','oai:repositorio.chlc.min-saude.pt:') \n" +
|
||||
"WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
|
||||
|
||||
System.exit(0);
|
||||
|
||||
ConnectDB.getConnection().commit();
|
||||
|
||||
stmt = ConnectDB.getConnection().createStatement();
|
||||
|
|
Loading…
Reference in New Issue