More progress on LaReFerenciaLogs

This commit is contained in:
Spyros Zoupanos 2020-09-20 00:59:33 +03:00
parent 053588c365
commit 2b2bac9b28
3 changed files with 442 additions and 402 deletions

View File

@ -1,10 +1,5 @@
package eu.dnetlib.oa.graph.usagestats.export;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
package eu.dnetlib.oa.graph.usagestats.export;
import java.io.*;
import java.net.URL;
@ -13,8 +8,14 @@ import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Calendar;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
@ -26,7 +27,7 @@ public class LaReferenciaDownloadLogs {
private final String tokenAuth;
/*
The Piwik's API method
* The Piwik's API method
*/
private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
private final String format = "&format=json";
@ -38,22 +39,31 @@ public class LaReferenciaDownloadLogs {
this.piwikUrl = piwikUrl;
this.tokenAuth = tokenAuth;
this.createTables();
this.createTmpTables();
// this.createTmpTables();
}
private void createTables() throws Exception {
try {
Statement stmt = ConnectDB.getConnection().createStatement();
String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialog(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ " ON INSERT TO lareferencialog "
+ " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
+ "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id "
+ "FROM lareferencialog "
+ "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");";
System.out.println("====> Creating LaReferencia tables");
String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " +
ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " +
"source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " +
"source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
"clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " +
"stored as orc tblproperties('transactional'='true')";
stmt.executeUpdate(sqlCreateTableLareferenciaLog);
stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
System.out.println("====> Created LaReferencia tables");
// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
// + " ON INSERT TO lareferencialog "
// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id "
// + "FROM lareferencialog "
// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");";
// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
stmt.close();
ConnectDB.getConnection().close();
@ -89,6 +99,7 @@ public class LaReferenciaDownloadLogs {
// System.exit(0);
}
}
private String getPiwikLogUrl() {
return piwikUrl + "/";
}
@ -143,19 +154,27 @@ public class LaReferenciaDownloadLogs {
end.add(Calendar.DAY_OF_MONTH, -1);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
PreparedStatement st = ConnectDB.getConnection().prepareStatement("SELECT max(timestamp) FROM lareferencialog WHERE matomoid=? HAVING max(timestamp) is not null;");
PreparedStatement st = ConnectDB
.getConnection()
.prepareStatement(
"SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() +
".lareferencialog WHERE matomoid=? HAVING max(timestamp) is not null");
st.setInt(1, laReferencialMatomoID);
ResultSet rs_date = st.executeQuery();
while (rs_date.next()) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") && !rs_date.getString(1).equals("")) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
}
}
rs_date.close();
for (Date date = start.getTime(); start.before(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
log.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " + sdf.format(date));
log
.info(
"Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "
+ sdf.format(date));
String period = "&period=day&date=" + sdf.format(date);
String outFolder = "";
@ -163,14 +182,19 @@ public class LaReferenciaDownloadLogs {
FileSystem fs = FileSystem.get(new Configuration());
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format
+ "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
String content = "";
int i = 0;
while (!content.equals("[]\n")) {
FSDataOutputStream fin = fs.create(new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + "_" + i + ".json"), true);
FSDataOutputStream fin = fs
.create(
new Path(outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + "_"
+ i + ".json"),
true);
String apiUrl = baseApiUrl;
if (i > 0) {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.oa.graph.usagestats.export;
import java.io.*;
@ -13,9 +14,9 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.log4j.Logger;
import org.json.simple.JSONArray;
@ -39,14 +40,8 @@ public class LaReferenciaStats {
}
/*
private void connectDB() throws Exception {
try {
ConnectDB connectDB = new ConnectDB();
} catch (Exception e) {
log.error("Connect to db failed: " + e);
throw new Exception("Failed to connect to db: " + e.toString(), e);
}
}
* private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) {
* log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } }
*/
private void createTables() throws Exception {
try {
@ -126,7 +121,10 @@ public class LaReferenciaStats {
// File dir = new File(this.logRepoPath);
// File[] jsonFiles = dir.listFiles();
PreparedStatement prepStatem = ConnectDB.getConnection().prepareStatement("INSERT INTO lareferencialogtmp (matomoid, source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?,?)");
PreparedStatement prepStatem = ConnectDB
.getConnection()
.prepareStatement(
"INSERT INTO lareferencialogtmp (matomoid, source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?,?)");
int batch_size = 0;
JSONParser parser = new JSONParser();
@ -149,10 +147,15 @@ public class LaReferenciaStats {
if (actionDetailsObj.get("customVariables") != null) {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
Timestamp timestamp = new Timestamp(Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
Timestamp timestamp = new Timestamp(
Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
String url = actionDetailsObj.get("url").toString();
String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("1")).get("customVariablePageValue1").toString();
String opendoar = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("2")).get("customVariablePageValue2").toString();
String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("1"))
.get("customVariablePageValue1")
.toString();
String opendoar = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("2"))
.get("customVariablePageValue2")
.toString();
String action = actionDetailsObj.get("type").toString();
prepStatem.setInt(1, idSite);
prepStatem.setString(2, "opendoar____::" + opendoar);
@ -217,11 +220,17 @@ public class LaReferenciaStats {
Statement stmt = ConnectDB.getConnection().createStatement();
ConnectDB.getConnection().setAutoCommit(false);
//String sql = "CREATE OR REPLACE VIEW result_views_monthly AS SELECT entity_id AS id, COUNT(entity_id) as views, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM lareferencialog where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
// String sql = "CREATE OR REPLACE VIEW result_views_monthly AS SELECT entity_id AS id, COUNT(entity_id) as
// views, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS
// VARCHAR), 2, '0') AS month, source FROM lareferencialog where action='action' and (source_item_type='oaItem'
// or source_item_type='repItem') group by id, month, source order by source, id, month;";
String sql = "CREATE OR REPLACE VIEW la_result_views_monthly_tmp AS SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM lareferencialogtmp where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
stmt.executeUpdate(sql);
// sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count, max(openaire_referrer) AS openaire INTO views_stats FROM result_views_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
// sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date,
// max(views) AS count, max(openaire_referrer) AS openaire INTO views_stats FROM result_views_monthly p,
// datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by
// repository_id, result_id, date ORDER BY repository_id, result_id, date;";
sql = "CREATE TABLE IF NOT EXISTS la_views_stats_tmp AS SELECT 'LaReferencia'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count, max(openaire_referrer) AS openaire FROM la_result_views_monthly_tmp p, public.datasource_oids d, public.result_oids ro where p.source=d.orid and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
stmt.executeUpdate(sql);
@ -235,11 +244,18 @@ public class LaReferenciaStats {
Statement stmt = ConnectDB.getConnection().createStatement();
ConnectDB.getConnection().setAutoCommit(false);
//String sql = "CREATE OR REPLACE VIEW result_downloads_monthly as select entity_id AS id, COUNT(entity_id) as downloads, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM lareferencialog where action='download' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
// String sql = "CREATE OR REPLACE VIEW result_downloads_monthly as select entity_id AS id, COUNT(entity_id) as
// downloads, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS
// VARCHAR), 2, '0') AS month, source FROM lareferencialog where action='download' and
// (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id,
// month;";
String sql = "CREATE OR REPLACE VIEW la_result_downloads_monthly_tmp as select entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM lareferencialogtmp where action='download' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
stmt.executeUpdate(sql);
//sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count INTO downloads_stats FROM result_downloads_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
// sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date,
// max(downloads) AS count INTO downloads_stats FROM result_downloads_monthly p, datasource d, result_oids ro
// where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY
// repository_id, result_id, date;";
// sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count, max(openaire_referrer) AS openaire INTO downloads_stats FROM result_downloads_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
sql = "CREATE TABLE IF NOT EXISTS la_downloads_stats_tmp AS SELECT 'LaReferencia'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count, max(openaire_referrer) AS openaire FROM la_result_downloads_monthly_tmp p, public.datasource_oids d, public.result_oids ro where p.source=d.orid and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
stmt.executeUpdate(sql);
@ -249,7 +265,6 @@ public class LaReferenciaStats {
ConnectDB.getConnection().close();
}
private void updateProdTables() throws SQLException, Exception {
Statement stmt = ConnectDB.getConnection().createStatement();

View File

@ -57,21 +57,22 @@ public class UsageStatsExporter {
// piwikstatsdb.processLogs();
log.info("process logs done");
// LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(lareferenciaBaseURL,lareferenciaAuthToken);
// lrf.GetLaReferenciaRepos(lareferenciaLogPath);
System.out.println("====> Creating LaReferencia tables");
LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(lareferenciaBaseURL, lareferenciaAuthToken);
lrf.GetLaReferenciaRepos(lareferenciaLogPath);
// LaReferenciaStats lastats = new LaReferenciaStats(lareferenciaLogPath);
// lastats.processLogs();
// log.info("LaReferencia logs done");
IrusStats irusstats = new IrusStats(irusUKBaseURL);
// IrusStats irusstats = new IrusStats(irusUKBaseURL);
// irusstats.getIrusRRReport(irusUKReportPath);
// irusstats.processIrusStats();
// log.info("irus done");
SarcStats sarcStats = new SarcStats();
// SarcStats sarcStats = new SarcStats();
// sarcStats.getAndProcessSarc(sarcsReportPathArray, sarcsReportPathNonArray);
sarcStats.finalizeSarcStats();
// sarcStats.finalizeSarcStats();
// log.info("sarc done");
// // finalize usagestats