forked from D-Net/dnet-hadoop
lareferencia removeDoubleClicks done
This commit is contained in:
parent
0369f36776
commit
65acece7c4
|
@ -104,12 +104,15 @@ public class LaReferenciaStats {
|
||||||
public void processLogs() throws Exception {
|
public void processLogs() throws Exception {
|
||||||
try {
|
try {
|
||||||
System.out.println("====> Processing LaReferencia repository logs");
|
System.out.println("====> Processing LaReferencia repository logs");
|
||||||
processlaReferenciaLog();
|
// processlaReferenciaLog();
|
||||||
System.out.println("====> LaReferencia repository logs process done");
|
System.out.println("====> LaReferencia repository logs process done");
|
||||||
log.info("LaReferencia repository process done");
|
log.info("LaReferencia repository process done");
|
||||||
|
|
||||||
|
System.out.println("====> LaReferencia removing double clicks");
|
||||||
// removeDoubleClicks();
|
// removeDoubleClicks();
|
||||||
// log.info("LaReferencia removing double clicks done");
|
System.out.println("====> LaReferencia removed double clicks");
|
||||||
// viewsStats();
|
log.info("LaReferencia removing double clicks done");
|
||||||
|
viewsStats();
|
||||||
// log.info("LaReferencia views done");
|
// log.info("LaReferencia views done");
|
||||||
// downloadsStats();
|
// downloadsStats();
|
||||||
// log.info("LaReferencia downloads done");
|
// log.info("LaReferencia downloads done");
|
||||||
|
@ -202,105 +205,43 @@ public class LaReferenciaStats {
|
||||||
stmt.close();
|
stmt.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Loads LaReferencia JSON log files into the lareferencialogtmp table.
 *
 * Lists the files under this.logRepoPath with listHdfsDir, parses each file as a JSON array of
 * visit rows and, for every entry of a row's "actionDetails" that carries "customVariables",
 * queues one INSERT on a JDBC batch. The batch is executed and committed every 10000 queued
 * rows and once more after the last file.
 *
 * NOTE(review): in this side-by-side diff the second (post-commit) cell of every row of this
 * method is empty, so the commit appears to delete the method.
 * NOTE(review): prepStatem is not closed anywhere in this method.
 *
 * @throws Exception propagated from listing, reading and parsing the files and from the in-loop
 *         JDBC calls; failures of the final executeBatch/commit are caught at the end instead
 */
public void processlaReferenciaLogOld() throws Exception {
|

||||||

|
||||||
// NOTE(review): stmt is never used to run SQL here; it is only closed in the final try block.
Statement stmt = ConnectDB.getConnection().createStatement();
|

||||||
// Auto-commit is switched off; the batches below are committed explicitly.
ConnectDB.getConnection().setAutoCommit(false);
|

||||||
ArrayList<String> jsonFiles = listHdfsDir(this.logRepoPath);
|

||||||

|
||||||
// File dir = new File(this.logRepoPath);
|

||||||
// File[] jsonFiles = dir.listFiles();
|

||||||
PreparedStatement prepStatem = ConnectDB
|

||||||
.getConnection()
|

||||||
.prepareStatement(
|

||||||
"INSERT INTO lareferencialogtmp (matomoid, source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?,?)");
|

||||||
// Number of rows queued on prepStatem since the last executeBatch().
int batch_size = 0;
|

||||||

|
||||||
JSONParser parser = new JSONParser();
|

||||||
for (String jsonFile : jsonFiles) {
|

||||||
System.out.println(jsonFile);
|

||||||
JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
|

||||||
for (Object aJsonArray : jsonArray) {
|

||||||
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
|

||||||
// Visit-level fields, reused for every action of this row.
int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
|

||||||
String idVisit = jsonObjectRow.get("idVisit").toString();
|

||||||
String country = jsonObjectRow.get("country").toString();
|

||||||
String referrerName = jsonObjectRow.get("referrerName").toString();
|

||||||
String agent = jsonObjectRow.get("browser").toString();
|

||||||
String sourceItemType = "repItem";
|

||||||

|
||||||
JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
|

||||||
for (Object actionDetail : actionDetails) {
|

||||||
JSONObject actionDetailsObj = (JSONObject) actionDetail;
|

||||||

|
||||||
// Only actions that carry customVariables are inserted; all others are skipped.
if (actionDetailsObj.get("customVariables") != null) {
|

||||||
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|

||||||
simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|

||||||
// The "timestamp" field is multiplied by 1000 before building the Timestamp
// (presumably epoch seconds -> milliseconds — confirm against the log format).
Timestamp timestamp = new Timestamp(
|

||||||
Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
|

||||||
String url = actionDetailsObj.get("url").toString();
|

||||||
// customVariables["1"].customVariablePageValue1 is bound to parameter 7 (entity_id).
String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("1"))
|

||||||
.get("customVariablePageValue1")
|

||||||
.toString();
|

||||||
// customVariables["2"].customVariablePageValue2 is bound to parameter 2 (source),
// prefixed with "opendoar____::".
String opendoar = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("2"))
|

||||||
.get("customVariablePageValue2")
|

||||||
.toString();
|

||||||
String action = actionDetailsObj.get("type").toString();
|

||||||
prepStatem.setInt(1, idSite);
|

||||||
prepStatem.setString(2, "opendoar____::" + opendoar);
|

||||||
prepStatem.setString(3, idVisit);
|

||||||
prepStatem.setString(4, country);
|

||||||
prepStatem.setString(5, action);
|

||||||
prepStatem.setString(6, url);
|

||||||
prepStatem.setString(7, oaipmh);
|

||||||
prepStatem.setString(8, sourceItemType);
|

||||||
prepStatem.setString(9, simpleDateFormat.format(timestamp));
|

||||||
prepStatem.setString(10, referrerName);
|

||||||
prepStatem.setString(11, agent);
|

||||||
// prepStatem.setString(11, );
|

||||||
prepStatem.addBatch();
|

||||||
batch_size++;
|

||||||
// Flush and commit once 10000 rows are queued, then restart the count.
if (batch_size == 10000) {
|

||||||
prepStatem.executeBatch();
|

||||||
ConnectDB.getConnection().commit();
|

||||||
batch_size = 0;
|

||||||
}
|

||||||
}
|

||||||
}
|

||||||
}
|

||||||
}
|

||||||
// Flush whatever is still queued after the last file.
try {
|

||||||
prepStatem.executeBatch();
|

||||||
ConnectDB.getConnection().commit();
|

||||||
stmt.close();
|

||||||
} catch (Exception e) {
|

||||||

|
||||||
// NOTE(review): the exception is not rethrown; for a SQLException only the chained
// exception's message is printed, and any other exception type is dropped silently.
// getNextException() may return null, in which case ne.getMessage() would throw a
// NullPointerException — confirm this is acceptable.
if (e instanceof java.sql.SQLException) {
|

||||||
java.sql.SQLException ne = ((java.sql.SQLException) e).getNextException();
|

||||||
System.out.println(ne.getMessage());
|

||||||
}
|

||||||
}
|

||||||

|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Removes "double click" rows from the lareferencialogtmp table in two passes: first rows whose
 * action is 'download', then rows whose action is 'action'. In each pass a row is deleted when
 * another row with the same source, id_visit, entity_id and action exists with a later
 * timestamp inside the pass's time window, i.e. the earlier row of each close pair is removed.
 *
 * This span is a rendered side-by-side diff: in every row the first cell is the pre-commit line
 * and the second cell the post-commit line. The post-commit version qualifies the table with
 * ConnectDB.getUsageStatsDBSchema(), replaces extract(EPOCH FROM ...) with unix_timestamp(...),
 * prints progress on System.out and no longer calls commit().
 *
 * @throws Exception propagated from the JDBC calls; nothing is caught here
 */
public void removeDoubleClicks() throws Exception {
|
public void removeDoubleClicks() throws Exception {
|
||||||

|
||||||
Statement stmt = ConnectDB.getConnection().createStatement();
|
Statement stmt = ConnectDB.getConnection().createStatement();
|
||||||
ConnectDB.getConnection().setAutoCommit(false);
|
ConnectDB.getConnection().setAutoCommit(false);
|
||||||

|
||||||
|
System.out.println("====> Cleaning download double clicks");
|
||||||
// clean download double clicks
|
// clean download double clicks
|
||||||
// NOTE(review): the pre-commit query compares extract(EPOCH FROM ...) (presumably seconds)
// directly with 30, while the post-commit query divides the unix_timestamp difference by 60
// before comparing with 30. If unix_timestamp() returns seconds, the window grows from
// 30 seconds to 30 minutes — confirm this is intended.
String sql = "DELETE FROM lareferencialogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp FROM lareferencialogtmp p1, lareferencialogtmp p2 WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<30 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
|
String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" +
|
||||||
|
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " +
|
||||||
|
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " +
|
||||||
|
ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " +
|
||||||
|
"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " +
|
||||||
|
"AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " +
|
||||||
|
"AND p1.timestamp<p2.timestamp AND ((unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))/60)<30 " +
|
||||||
|
"AND lareferencialogtmp.source=p1.source AND lareferencialogtmp.id_visit=p1.id_visit " +
|
||||||
|
"AND lareferencialogtmp.action=p1.action AND lareferencialogtmp.entity_id=p1.entity_id " +
|
||||||
|
"AND lareferencialogtmp.timestamp=p1.timestamp)";
|
||||||
stmt.executeUpdate(sql);
|
stmt.executeUpdate(sql);
|
||||||
stmt.close();
|
stmt.close();
|
||||||
// NOTE(review): the post-commit side replaces commit() with a log line although auto-commit is
// still switched off above — confirm the DELETE is committed elsewhere or that the target
// engine does not need an explicit commit.
ConnectDB.getConnection().commit();
|
System.out.println("====> Cleaned download double clicks");
|
||||||

|
||||||
stmt = ConnectDB.getConnection().createStatement();
|
stmt = ConnectDB.getConnection().createStatement();
|
||||||
|
System.out.println("====> Cleaning action double clicks");
|
||||||
// clean view double clicks
|
// clean view double clicks
|
||||||
// Second pass: same query shape with action='action' and a threshold of 10.
// NOTE(review): the same unit question applies here — 10 seconds before the commit versus
// (difference / 60) < 10 after it; confirm.
sql = "DELETE FROM lareferencialogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp from lareferencialogtmp p1, lareferencialogtmp p2 WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<10 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
|
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" +
|
||||||
|
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " +
|
||||||
|
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " +
|
||||||
|
ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " +
|
||||||
|
"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " +
|
||||||
|
"AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp " +
|
||||||
|
"AND p1.timestamp<p2.timestamp AND ((unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))/60)<10 " +
|
||||||
|
"AND lareferencialogtmp.source=p1.source AND lareferencialogtmp.id_visit=p1.id_visit " +
|
||||||
|
"AND lareferencialogtmp.action=p1.action AND lareferencialogtmp.entity_id=p1.entity_id " +
|
||||||
|
"AND lareferencialogtmp.timestamp=p1.timestamp)";
|
||||||
stmt.executeUpdate(sql);
|
stmt.executeUpdate(sql);
|
||||||
stmt.close();
|
stmt.close();
|
||||||
// NOTE(review): as in the first pass, the post-commit side drops this commit() call.
ConnectDB.getConnection().commit();
|
System.out.println("====> Cleaned action double clicks");
|
||||||
// conn.close();
|
// conn.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -319,7 +319,7 @@ public class PiwikStatsDB {
|
||||||
System.out.println("====> Cleaned download double clicks");
|
System.out.println("====> Cleaned download double clicks");
|
||||||
|
|
||||||
// clean view double clicks
|
// clean view double clicks
|
||||||
System.out.println("====> Cleaning view double clicks");
|
System.out.println("====> Cleaning action double clicks");
|
||||||
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
|
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
|
||||||
"WHERE EXISTS (\n" +
|
"WHERE EXISTS (\n" +
|
||||||
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
|
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
|
||||||
|
@ -332,7 +332,7 @@ public class PiwikStatsDB {
|
||||||
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
|
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
|
||||||
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
|
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
|
||||||
stmt.executeUpdate(sql);
|
stmt.executeUpdate(sql);
|
||||||
System.out.println("====> Cleaned view double clicks");
|
System.out.println("====> Cleaned action double clicks");
|
||||||
stmt.close();
|
stmt.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue