forked from D-Net/dnet-hadoop
lareferencia removeDoubleClicks done
This commit is contained in:
parent
0369f36776
commit
65acece7c4
|
@ -104,12 +104,15 @@ public class LaReferenciaStats {
|
|||
public void processLogs() throws Exception {
|
||||
try {
|
||||
System.out.println("====> Processing LaReferencia repository logs");
|
||||
processlaReferenciaLog();
|
||||
// processlaReferenciaLog();
|
||||
System.out.println("====> LaReferencia repository logs process done");
|
||||
log.info("LaReferencia repository process done");
|
||||
|
||||
System.out.println("====> LaReferencia removing double clicks");
|
||||
// removeDoubleClicks();
|
||||
// log.info("LaReferencia removing double clicks done");
|
||||
// viewsStats();
|
||||
System.out.println("====> LaReferencia removed double clicks");
|
||||
log.info("LaReferencia removing double clicks done");
|
||||
viewsStats();
|
||||
// log.info("LaReferencia views done");
|
||||
// downloadsStats();
|
||||
// log.info("LaReferencia downloads done");
|
||||
|
@ -202,105 +205,43 @@ public class LaReferenciaStats {
|
|||
stmt.close();
|
||||
}
|
||||
|
||||
public void processlaReferenciaLogOld() throws Exception {
|
||||
|
||||
Statement stmt = ConnectDB.getConnection().createStatement();
|
||||
ConnectDB.getConnection().setAutoCommit(false);
|
||||
ArrayList<String> jsonFiles = listHdfsDir(this.logRepoPath);
|
||||
|
||||
// File dir = new File(this.logRepoPath);
|
||||
// File[] jsonFiles = dir.listFiles();
|
||||
PreparedStatement prepStatem = ConnectDB
|
||||
.getConnection()
|
||||
.prepareStatement(
|
||||
"INSERT INTO lareferencialogtmp (matomoid, source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?,?)");
|
||||
int batch_size = 0;
|
||||
|
||||
JSONParser parser = new JSONParser();
|
||||
for (String jsonFile : jsonFiles) {
|
||||
System.out.println(jsonFile);
|
||||
JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
|
||||
for (Object aJsonArray : jsonArray) {
|
||||
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
|
||||
int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
|
||||
String idVisit = jsonObjectRow.get("idVisit").toString();
|
||||
String country = jsonObjectRow.get("country").toString();
|
||||
String referrerName = jsonObjectRow.get("referrerName").toString();
|
||||
String agent = jsonObjectRow.get("browser").toString();
|
||||
String sourceItemType = "repItem";
|
||||
|
||||
JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
|
||||
for (Object actionDetail : actionDetails) {
|
||||
JSONObject actionDetailsObj = (JSONObject) actionDetail;
|
||||
|
||||
if (actionDetailsObj.get("customVariables") != null) {
|
||||
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
Timestamp timestamp = new Timestamp(
|
||||
Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
|
||||
String url = actionDetailsObj.get("url").toString();
|
||||
String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("1"))
|
||||
.get("customVariablePageValue1")
|
||||
.toString();
|
||||
String opendoar = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("2"))
|
||||
.get("customVariablePageValue2")
|
||||
.toString();
|
||||
String action = actionDetailsObj.get("type").toString();
|
||||
prepStatem.setInt(1, idSite);
|
||||
prepStatem.setString(2, "opendoar____::" + opendoar);
|
||||
prepStatem.setString(3, idVisit);
|
||||
prepStatem.setString(4, country);
|
||||
prepStatem.setString(5, action);
|
||||
prepStatem.setString(6, url);
|
||||
prepStatem.setString(7, oaipmh);
|
||||
prepStatem.setString(8, sourceItemType);
|
||||
prepStatem.setString(9, simpleDateFormat.format(timestamp));
|
||||
prepStatem.setString(10, referrerName);
|
||||
prepStatem.setString(11, agent);
|
||||
// prepStatem.setString(11, );
|
||||
prepStatem.addBatch();
|
||||
batch_size++;
|
||||
if (batch_size == 10000) {
|
||||
prepStatem.executeBatch();
|
||||
ConnectDB.getConnection().commit();
|
||||
batch_size = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try {
|
||||
prepStatem.executeBatch();
|
||||
ConnectDB.getConnection().commit();
|
||||
stmt.close();
|
||||
} catch (Exception e) {
|
||||
|
||||
if (e instanceof java.sql.SQLException) {
|
||||
java.sql.SQLException ne = ((java.sql.SQLException) e).getNextException();
|
||||
System.out.println(ne.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void removeDoubleClicks() throws Exception {
|
||||
|
||||
Statement stmt = ConnectDB.getConnection().createStatement();
|
||||
ConnectDB.getConnection().setAutoCommit(false);
|
||||
|
||||
System.out.println("====> Cleaning download double clicks");
|
||||
// clean download double clicks
|
||||
String sql = "DELETE FROM lareferencialogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp FROM lareferencialogtmp p1, lareferencialogtmp p2 WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<30 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
|
||||
String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" +
|
||||
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " +
|
||||
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " +
|
||||
ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " +
|
||||
"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " +
|
||||
"AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " +
|
||||
"AND p1.timestamp<p2.timestamp AND ((unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))/60)<30 " +
|
||||
"AND lareferencialogtmp.source=p1.source AND lareferencialogtmp.id_visit=p1.id_visit " +
|
||||
"AND lareferencialogtmp.action=p1.action AND lareferencialogtmp.entity_id=p1.entity_id " +
|
||||
"AND lareferencialogtmp.timestamp=p1.timestamp)";
|
||||
stmt.executeUpdate(sql);
|
||||
stmt.close();
|
||||
ConnectDB.getConnection().commit();
|
||||
System.out.println("====> Cleaned download double clicks");
|
||||
|
||||
stmt = ConnectDB.getConnection().createStatement();
|
||||
|
||||
System.out.println("====> Cleaning action double clicks");
|
||||
// clean view double clicks
|
||||
sql = "DELETE FROM lareferencialogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp from lareferencialogtmp p1, lareferencialogtmp p2 WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<10 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
|
||||
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" +
|
||||
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " +
|
||||
"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " +
|
||||
ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " +
|
||||
"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " +
|
||||
"AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp " +
|
||||
"AND p1.timestamp<p2.timestamp AND ((unix_timestamp(p2.timestamp)-unix_timestamp(p1.timestamp))/60)<10 " +
|
||||
"AND lareferencialogtmp.source=p1.source AND lareferencialogtmp.id_visit=p1.id_visit " +
|
||||
"AND lareferencialogtmp.action=p1.action AND lareferencialogtmp.entity_id=p1.entity_id " +
|
||||
"AND lareferencialogtmp.timestamp=p1.timestamp)";
|
||||
stmt.executeUpdate(sql);
|
||||
stmt.close();
|
||||
ConnectDB.getConnection().commit();
|
||||
System.out.println("====> Cleaned action double clicks");
|
||||
// conn.close();
|
||||
}
|
||||
|
||||
|
|
|
@ -319,7 +319,7 @@ public class PiwikStatsDB {
|
|||
System.out.println("====> Cleaned download double clicks");
|
||||
|
||||
// clean view double clicks
|
||||
System.out.println("====> Cleaning view double clicks");
|
||||
System.out.println("====> Cleaning action double clicks");
|
||||
sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
|
||||
"WHERE EXISTS (\n" +
|
||||
"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" +
|
||||
|
@ -332,7 +332,7 @@ public class PiwikStatsDB {
|
|||
"AND piwiklogtmp.source=p1.source AND piwiklogtmp.id_visit=p1.id_visit \n" +
|
||||
"AND piwiklogtmp.action=p1.action AND piwiklogtmp.entity_id=p1.entity_id AND piwiklogtmp.timestamp=p1.timestamp)";
|
||||
stmt.executeUpdate(sql);
|
||||
System.out.println("====> Cleaned view double clicks");
|
||||
System.out.println("====> Cleaned action double clicks");
|
||||
stmt.close();
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue