forked from D-Net/dnet-hadoop
More progress - Adding queries to code
This commit is contained in:
parent 8db9a7ccdc
commit f3dda9858c
@@ -70,34 +70,33 @@ public class PiwikStatsDB {
	private void createTables() throws Exception {
		try {
			stmt = ConnectDB.getConnection().createStatement();
			String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
				+ ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets stored as orc tblproperties('transactional'='true')";
			String sqlcreateRulePiwikLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
				+ " ON INSERT TO piwiklog "
				+ " WHERE (EXISTS ( SELECT piwiklog.source, piwiklog.id_visit,"
				+ "piwiklog.action, piwiklog.\"timestamp\", piwiklog.entity_id "
				+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog "
				+ "WHERE piwiklog.source = new.source AND piwiklog.id_visit = new.id_visit AND piwiklog.action = new.action AND piwiklog.entity_id = new.entity_id AND piwiklog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING";
			String sqlCreateRuleIndexPiwikLog = "create index if not exists piwiklog_rule on "
				+ ConnectDB.getUsageStatsDBSchema() + ".piwiklog(source, id_visit, action, entity_id, \"timestamp\")";

			// Create Piwiklog table - This table should exist
			String sqlCreateTablePiwikLog =
				"CREATE TABLE IF NOT EXISTS "
					+ ConnectDB.getUsageStatsDBSchema()
					+ ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
					+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
					+ "clustered by (source, id_visit, action, timestamp, entity_id) "
					+ "into 100 buckets stored as orc tblproperties('transactional'='true')";
			stmt.executeUpdate(sqlCreateTablePiwikLog);
			// stmt.executeUpdate(sqlcreateRulePiwikLog); --> We need to find a way to eliminate duplicates
			// stmt.executeUpdate(sqlCreateRuleIndexPiwikLog); --> We probably don't need indexes

			String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
				+ ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
			String sqlcreateRulePortalLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
				+ " ON INSERT TO " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log "
				+ " WHERE (EXISTS ( SELECT process_portal_log.source, process_portal_log.id_visit,"
				+ "process_portal_log.\"timestamp\" "
				+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log "
				+ "WHERE process_portal_log.source = new.source AND process_portal_log.id_visit = new.id_visit AND process_portal_log.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
			String sqlCreateRuleIndexPortalLog = "create index if not exists process_portal_log_rule on "
				+ ConnectDB.getUsageStatsDBSchema() + ".process_portal_log(source, id_visit, \"timestamp\");";

			/////////////////////////////////////////
			// Rule for duplicate inserts @ piwiklog
			/////////////////////////////////////////

			String sqlCreateTablePortalLog =
				"CREATE TABLE IF NOT EXISTS "
					+ ConnectDB.getUsageStatsDBSchema()
					+ ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
					+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
					+ "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
			stmt.executeUpdate(sqlCreateTablePortalLog);
			// stmt.executeUpdate(sqlcreateRulePortalLog); --> We need to find a way to eliminate duplicates
			// stmt.executeUpdate(sqlCreateRuleIndexPortalLog); --> We probably don't need indexes

			//////////////////////////////////////////////////
			// Rule for duplicate inserts @ process_portal_log
			//////////////////////////////////////////////////

			stmt.close();
			ConnectDB.getConnection().close();
			log.info("Usage Tables Created");
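One possible direction for the "eliminate duplicates" TODO above, sketched under the assumption that the cluster's Hive version permits INSERT OVERWRITE on transactional tables (otherwise the deduplicated result has to be staged in a side table first); this statement is illustrative, not part of the commit:

			// Hypothetical dedup step, not in this commit: rewrite the table in place,
			// replacing its contents with the DISTINCT projection of its own rows.
			String sqlDedupPiwikLog = "INSERT OVERWRITE TABLE " + ConnectDB.getUsageStatsDBSchema()
				+ ".piwiklog SELECT DISTINCT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog";
			stmt.executeUpdate(sqlDedupPiwikLog);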
@@ -111,28 +110,38 @@ public class PiwikStatsDB {
	private void createTmpTables() throws Exception {
		try {
			Statement stmt = ConnectDB.getConnection().createStatement();
			String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
				+ ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets stored as orc tblproperties('transactional'='true')";
			String sqlcreateTmpRulePiwikLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
				+ " ON INSERT TO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
				+ " WHERE (EXISTS ( SELECT piwiklogtmp.source, piwiklogtmp.id_visit,"
				+ "piwiklogtmp.action, piwiklogtmp.\"timestamp\", piwiklogtmp.entity_id "
				+ "FROM piwiklogtmp "
				+ "WHERE piwiklogtmp.source = new.source AND piwiklogtmp.id_visit = new.id_visit AND piwiklogtmp.action = new.action AND piwiklogtmp.entity_id = new.entity_id AND piwiklogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
			String sqlCreateTmpTablePiwikLog =
				"CREATE TABLE IF NOT EXISTS "
					+ ConnectDB.getUsageStatsDBSchema()
					+ ".piwiklogtmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
					+ "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
					+ "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
					+ "stored as orc tblproperties('transactional'='true')";
			stmt.executeUpdate(sqlCreateTmpTablePiwikLog);
			// stmt.executeUpdate(sqlcreateTmpRulePiwikLog); --> We need to find a way to eliminate duplicates

			//////////////////////////////////////////////////
			// Rule for duplicate inserts @ piwiklogtmp
			//////////////////////////////////////////////////

			//////////////////////////////////////////////////
			// Copy from public.piwiklog to piwiklog
			//////////////////////////////////////////////////
			// String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;";
			// stmt.executeUpdate(sqlCopyPublicPiwiklog);
			String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
			String sqlcreateTmpRulePortalLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
				+ " ON INSERT TO process_portal_log_tmp "
				+ " WHERE (EXISTS ( SELECT process_portal_log_tmp.source, process_portal_log_tmp.id_visit,"
				+ "process_portal_log_tmp.\"timestamp\" "
				+ "FROM process_portal_log_tmp "
				+ "WHERE process_portal_log_tmp.source = new.source AND process_portal_log_tmp.id_visit = new.id_visit AND process_portal_log_tmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";

			String sqlCreateTmpTablePortalLog =
				"CREATE TABLE IF NOT EXISTS "
					+ ConnectDB.getUsageStatsDBSchema()
					+ ".process_portal_log_tmp(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
					+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
					+ "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
			stmt.executeUpdate(sqlCreateTmpTablePortalLog);
			// stmt.executeUpdate(sqlcreateTmpRulePortalLog); --> We need to find a way to eliminate duplicates

			//////////////////////////////////////////////////
			// Rule for duplicate inserts @ process_portal_log_tmp
			//////////////////////////////////////////////////

			stmt.close();
			log.info("Usage Tmp Tables Created");
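Since Hive has no INSERT rules, the tmp tables suggest a second option for the duplicates TODO: filter at merge time, when tmp rows are copied into the permanent table. A sketch of such an anti-join merge, hypothetical and not part of this commit (backticks guard the reserved word timestamp):

			// Hypothetical merge, not in this commit: insert only those tmp rows whose
			// composite key (source, id_visit, action, timestamp, entity_id) is absent
			// from the permanent table.
			String sqlMergePiwikLog = "INSERT INTO TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog "
				+ "SELECT tmp.* FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp tmp "
				+ "LEFT JOIN " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog p "
				+ "ON tmp.source = p.source AND tmp.id_visit = p.id_visit AND tmp.action = p.action "
				+ "AND tmp.entity_id = p.entity_id AND tmp.`timestamp` = p.`timestamp` "
				+ "WHERE p.source IS NULL";
			stmt.executeUpdate(sqlMergePiwikLog);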
@@ -188,62 +188,64 @@ public class SarcStats {
				fin.writeChar('\n');
			}
			fin.close();

			// JSONObject jsonObjectRow = (JSONObject) aJsonArray;
			// JSONArray itemIdentifier = new JSONArray();
			// obj = jsonObjectRow.get("c:ItemIdentifier");
			// if (obj instanceof JSONObject) {
			// itemIdentifier.add(obj);
			// } else {
			// // JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("c:ItemIdentifier");
			// itemIdentifier = (JSONArray) obj;
			// }
			// for (Object identifier : itemIdentifier) {
			// JSONObject doi = (JSONObject) identifier;
			// if (doi.get("c:Type").toString().equals("DOI")) {
			// rid = doi.get("c:Value").toString();
			// // System.out.println("DOI: " + rid);
			// break;
			// }
			// }
			// if (rid.isEmpty()) {
			// continue;
			// }
			//
			// JSONObject itemPerformance = (JSONObject) jsonObjectRow.get("c:ItemPerformance");
			// // for (Object perf : itemPerformance) {
			// JSONObject performance = (JSONObject) itemPerformance;
			// JSONObject periodObj = (JSONObject) performance.get("c:Period");
			// String period = periodObj.get("c:Begin").toString();
			// JSONObject instanceObj = (JSONObject) performance.get("c:Instance");
			// String type = instanceObj.get("c:MetricType").toString();
			// String count = instanceObj.get("c:Count").toString();
			// // System.out.println(rid + " : " + period + " : " + count);
			//
			// preparedStatement.setString(1, "SARC-OJS");
			// preparedStatement.setString(2, issn);
			// // preparedStatement.setString(2, url);
			// preparedStatement.setString(3, rid);
			// preparedStatement.setString(4, period);
			// preparedStatement.setString(5, type);
			// preparedStatement.setInt(6, Integer.parseInt(count));
			// preparedStatement.addBatch();
			// batch_size++;
			// if (batch_size == 10000) {
			// preparedStatement.executeBatch();
			// ConnectDB.getConnection().commit();
			// batch_size = 0;
			// }
			// // }
			//
			// // break;
			// }
			//////////////////
			// break;
		}

		preparedStatement.executeBatch();
		ConnectDB.getConnection().commit();
		ConnectDB.getConnection().close();
	}

	private String getJson(String url) throws Exception {
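The commented-out block above follows the standard JDBC batching pattern: accumulate rows with addBatch() and flush via executeBatch() every 10,000 rows, plus once more at the end for the remainder. Reduced to a self-contained sketch; the table, column names, and row layout are illustrative, not taken from this class:

import java.sql.Connection;
import java.sql.PreparedStatement;

public class BatchInsertSketch {

	private static final int BATCH_SIZE = 10000;

	// rows: (issn, rid, period, metricType, count) tuples, illustrative layout
	public static void insert(Connection conn, Iterable<String[]> rows) throws Exception {
		conn.setAutoCommit(false); // commit explicitly, once per flushed batch
		PreparedStatement ps = conn.prepareStatement(
			"INSERT INTO sushilog(source, repository, rid, date, metric_type, count) VALUES (?, ?, ?, ?, ?, ?)");
		int batched = 0;
		for (String[] r : rows) {
			ps.setString(1, "SARC-OJS");
			ps.setString(2, r[0]); // issn
			ps.setString(3, r[1]); // rid (the DOI)
			ps.setString(4, r[2]); // period
			ps.setString(5, r[3]); // metric type
			ps.setInt(6, Integer.parseInt(r[4])); // count
			ps.addBatch();
			if (++batched == BATCH_SIZE) {
				ps.executeBatch(); // flush a full batch
				conn.commit();
				batched = 0;
			}
		}
		ps.executeBatch(); // flush the remainder
		conn.commit();
		ps.close();
	}
}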
@@ -38,22 +38,20 @@ public class UsageStatsExporter {
		// connect to DB
		ConnectDB.init(properties);

		// // Create DB tables - they are also needed to download the statistics too
		// PiwikStatsDB piwikstatsdb = new PiwikStatsDB(repoLogPath, portalLogPath);
		// Create DB tables - they are also needed to download the statistics too
		PiwikStatsDB piwikstatsdb = new PiwikStatsDB(repoLogPath, portalLogPath);
		//
		// // Download the statistics - The following 2 lines are not needed after the download - Commenting them out for
		// // the moment
		// PiwikDownloadLogs piwd = new PiwikDownloadLogs(matomoBaseURL, matomoAuthToken);
		// piwd.GetOpenAIRELogs(repoLogPath, portalLogPath, portalMatomoID);
		//
		// System.exit(0);
		//
		// // Create DB tables, insert/update statistics
		//// String cRobotsUrl = properties.getProperty("COUNTER_robots_Url");
		// String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
		// piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
		// piwikstatsdb.processLogs();
		// log.info("process logs done");
		PiwikDownloadLogs piwd = new PiwikDownloadLogs(matomoBaseURL, matomoAuthToken);
		piwd.GetOpenAIRELogs(repoLogPath, portalLogPath, portalMatomoID);

		// Create DB tables, insert/update statistics
		// String cRobotsUrl = properties.getProperty("COUNTER_robots_Url");
		String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
		piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
		piwikstatsdb.processLogs();
		log.info("process logs done");

		// IrusStats irusstats = new IrusStats(irusUKBaseURL);
		// irusstats.processIrusRRReport(irusUKReportPath);
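cRobotsUrl points at the COUNTER-Robots list, a JSON array of regular-expression patterns for known robot user agents, which the log processing can use to drop robot hits. A sketch of how such a list might be fetched and compiled with json-simple; the class and method are illustrative, not the repository's API, and the "pattern" field name is assumed from the published list:

import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class CounterRobotsSketch {

	// Download the COUNTER-Robots JSON (assumed to be an array of {"pattern": "..."} entries)
	// and compile each pattern so user-agent strings can be matched against it.
	public static List<Pattern> load(String url) throws Exception {
		JSONParser parser = new JSONParser();
		JSONArray entries = (JSONArray) parser.parse(new InputStreamReader(new URL(url).openStream()));
		List<Pattern> patterns = new ArrayList<>();
		for (Object entry : entries) {
			String p = ((JSONObject) entry).get("pattern").toString();
			patterns.add(Pattern.compile(p, Pattern.CASE_INSENSITIVE));
		}
		return patterns;
	}
}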