forked from D-Net/dnet-hadoop
More progress on adding piwiklogtmp to the code
This commit is contained in:
parent
f3dda9858c
commit
293d6accd4
|
@ -197,82 +197,123 @@ public class PiwikStatsDB {
|
|||
public void processRepositoryLog() throws Exception {
|
||||
Statement stmt = ConnectDB.getConnection().createStatement();
|
||||
ConnectDB.getConnection().setAutoCommit(false);
|
||||
|
||||
ArrayList<String> jsonFiles = listHdfsDir(this.logRepoPath);
|
||||
// File dir = new File(this.logRepoPath);
|
||||
// File[] jsonFiles = dir.listFiles();
|
||||
|
||||
PreparedStatement prepStatem = ConnectDB
|
||||
.getConnection()
|
||||
.prepareStatement(
|
||||
"INSERT INTO piwiklogtmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
|
||||
int batch_size = 0;
|
||||
JSONParser parser = new JSONParser();
|
||||
for (String jsonFile : jsonFiles) {
|
||||
System.out.println(jsonFile);
|
||||
JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
|
||||
for (Object aJsonArray : jsonArray) {
|
||||
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
|
||||
int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
|
||||
String idVisit = jsonObjectRow.get("idVisit").toString();
|
||||
String country = jsonObjectRow.get("country").toString();
|
||||
String referrerName = jsonObjectRow.get("referrerName").toString();
|
||||
String agent = jsonObjectRow.get("browser").toString();
|
||||
boolean botFound = false;
|
||||
Iterator it = robotsList.iterator();
|
||||
while (it.hasNext()) {
|
||||
// Create a Pattern object
|
||||
Pattern r = Pattern.compile(it.next().toString());
|
||||
// Now create matcher object.
|
||||
Matcher m = r.matcher(agent);
|
||||
if (m.find()) {
|
||||
// System.out.println("Found value: " + m.group(0));
|
||||
botFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (botFound == false) {
|
||||
String sourceItemType = "repItem";
|
||||
|
||||
JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
|
||||
for (Object actionDetail : actionDetails) {
|
||||
JSONObject actionDetailsObj = (JSONObject) actionDetail;
|
||||
|
||||
if (actionDetailsObj.get("customVariables") != null) {
|
||||
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
Timestamp timestamp = new Timestamp(
|
||||
Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
|
||||
String url = actionDetailsObj.get("url").toString();
|
||||
String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables"))
|
||||
.get("1")).get("customVariablePageValue1").toString();
|
||||
String action = actionDetailsObj.get("type").toString();
|
||||
|
||||
prepStatem.setInt(1, idSite);
|
||||
prepStatem.setString(2, idVisit);
|
||||
prepStatem.setString(3, country);
|
||||
prepStatem.setString(4, action);
|
||||
prepStatem.setString(5, url);
|
||||
prepStatem.setString(6, oaipmh);
|
||||
prepStatem.setString(7, sourceItemType);
|
||||
prepStatem.setString(8, simpleDateFormat.format(timestamp));
|
||||
prepStatem.setString(9, referrerName);
|
||||
prepStatem.setString(10, agent);
|
||||
prepStatem.addBatch();
|
||||
batch_size++;
|
||||
if (batch_size == 10000) {
|
||||
prepStatem.executeBatch();
|
||||
ConnectDB.getConnection().commit();
|
||||
batch_size = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prepStatem.executeBatch();
|
||||
ConnectDB.getConnection().commit();
|
||||
|
||||
String stm_piwiklogtmp_json =
|
||||
"CREATE EXTERNAL TABLE IF NOT EXISTS " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
".piwiklogtmp_json(\n" +
|
||||
" `idSite` STRING,\n" +
|
||||
" `idVisit` STRING,\n" +
|
||||
" `country` STRING,\n" +
|
||||
" `referrerName` STRING,\n" +
|
||||
" `browser` STRING,\n" +
|
||||
" `actionDetails` ARRAY<\n" +
|
||||
" struct<\n" +
|
||||
" type: STRING,\n" +
|
||||
" url: STRING,\n" +
|
||||
" `customVariables`: struct<\n" +
|
||||
" `1`: struct<\n" +
|
||||
" `customVariablePageValue1`: STRING\n" +
|
||||
" >\n" +
|
||||
" >,\n" +
|
||||
" timestamp: String\n" +
|
||||
" >\n" +
|
||||
" >\n" +
|
||||
")\n" +
|
||||
"ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
|
||||
"LOCATION '/user/spyros/logs/usage_stats_logs/Repologs2/'\n" +
|
||||
"TBLPROPERTIES (\"transactional\"=\"false\");\n" +
|
||||
"";
|
||||
stmt.executeUpdate(stm_piwiklogtmp_json);
|
||||
|
||||
|
||||
String stm_piwiklogtmp =
|
||||
"CREATE TABLE " +
|
||||
ConnectDB.getUsageStatsDBSchema() +
|
||||
".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " +
|
||||
"entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
|
||||
"clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true');";
|
||||
stmt.executeUpdate(processRepositoryLog);
|
||||
|
||||
|
||||
stmt.close();
|
||||
|
||||
// ArrayList<String> jsonFiles = listHdfsDir(this.logRepoPath);
|
||||
//// File dir = new File(this.logRepoPath);
|
||||
//// File[] jsonFiles = dir.listFiles();
|
||||
//
|
||||
//
|
||||
// PreparedStatement prepStatem = ConnectDB
|
||||
// .getConnection()
|
||||
// .prepareStatement(
|
||||
// "INSERT INTO piwiklogtmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
|
||||
// int batch_size = 0;
|
||||
// JSONParser parser = new JSONParser();
|
||||
// for (String jsonFile : jsonFiles) {
|
||||
// System.out.println(jsonFile);
|
||||
// JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
|
||||
// for (Object aJsonArray : jsonArray) {
|
||||
// JSONObject jsonObjectRow = (JSONObject) aJsonArray;
|
||||
// int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
|
||||
// String idVisit = jsonObjectRow.get("idVisit").toString();
|
||||
// String country = jsonObjectRow.get("country").toString();
|
||||
// String referrerName = jsonObjectRow.get("referrerName").toString();
|
||||
// String agent = jsonObjectRow.get("browser").toString();
|
||||
// boolean botFound = false;
|
||||
// Iterator it = robotsList.iterator();
|
||||
// while (it.hasNext()) {
|
||||
// // Create a Pattern object
|
||||
// Pattern r = Pattern.compile(it.next().toString());
|
||||
// // Now create matcher object.
|
||||
// Matcher m = r.matcher(agent);
|
||||
// if (m.find()) {
|
||||
// // System.out.println("Found value: " + m.group(0));
|
||||
// botFound = true;
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// if (botFound == false) {
|
||||
// String sourceItemType = "repItem";
|
||||
//
|
||||
// JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
|
||||
// for (Object actionDetail : actionDetails) {
|
||||
// JSONObject actionDetailsObj = (JSONObject) actionDetail;
|
||||
//
|
||||
// if (actionDetailsObj.get("customVariables") != null) {
|
||||
// SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
// simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
// Timestamp timestamp = new Timestamp(
|
||||
// Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
|
||||
// String url = actionDetailsObj.get("url").toString();
|
||||
// String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables"))
|
||||
// .get("1")).get("customVariablePageValue1").toString();
|
||||
// String action = actionDetailsObj.get("type").toString();
|
||||
//
|
||||
// prepStatem.setInt(1, idSite);
|
||||
// prepStatem.setString(2, idVisit);
|
||||
// prepStatem.setString(3, country);
|
||||
// prepStatem.setString(4, action);
|
||||
// prepStatem.setString(5, url);
|
||||
// prepStatem.setString(6, oaipmh);
|
||||
// prepStatem.setString(7, sourceItemType);
|
||||
// prepStatem.setString(8, simpleDateFormat.format(timestamp));
|
||||
// prepStatem.setString(9, referrerName);
|
||||
// prepStatem.setString(10, agent);
|
||||
// prepStatem.addBatch();
|
||||
// batch_size++;
|
||||
// if (batch_size == 10000) {
|
||||
// prepStatem.executeBatch();
|
||||
// ConnectDB.getConnection().commit();
|
||||
// batch_size = 0;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// prepStatem.executeBatch();
|
||||
// ConnectDB.getConnection().commit();
|
||||
// stmt.close();
|
||||
}
|
||||
|
||||
public void removeDoubleClicks() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue