More control parameters and limits on the LaReferencia download

Spyros Zoupanos 2020-10-04 17:03:01 +03:00
parent 2b330dd84c
commit 48d6bf28eb
7 changed files with 120 additions and 45 deletions

@@ -36,9 +36,13 @@ public class ExecuteWorkflow {
 	static String dbImpalaUrl;
 	static String usageStatsDBSchema;
 	static String statsDBSchema;
+	static boolean recreateDbAndTables;
 	static boolean downloadLogs;
 	static Calendar startingLogPeriod;
 	static Calendar endingLogPeriod;
+	static int numberOfPiwikIdsToDownload;
+	static int numberOfSiteIdsToDownload;
+	static boolean processPiwikLogs;

 	public static void main(String args[]) throws Exception {
@@ -72,11 +76,21 @@ public class ExecuteWorkflow {
 		usageStatsDBSchema = parser.get("usageStatsDBSchema");
 		statsDBSchema = parser.get("statsDBSchema");

+		if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
+			recreateDbAndTables = true;
+		else
+			recreateDbAndTables = false;
+
 		if (parser.get("downloadLogs").toLowerCase().equals("true"))
 			downloadLogs = true;
 		else
 			downloadLogs = false;

+		if (parser.get("processPiwikLogs").toLowerCase().equals("true"))
+			processPiwikLogs = true;
+		else
+			processPiwikLogs = false;
+
 		String startingLogPeriodStr = parser.get("startingLogPeriod");
 		Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
 		startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
@@ -85,6 +99,9 @@ public class ExecuteWorkflow {
 		Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
 		endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);

+		numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
+		numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
+
 		UsageStatsExporter usagestatsExport = new UsageStatsExporter();
 		usagestatsExport.export();
 	}
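Aside, not part of this commit: each of these four-line if/else blocks could collapse to a single assignment, since Boolean.parseBoolean is case-insensitive and simply yields false for null or non-"true" input. A minimal sketch, assuming parser.get hands back the raw strings shown above:

    public class FlagParsingSketch {
        public static void main(String[] args) {
            // Equivalent, more compact flag parsing; no NPE when the value is absent.
            boolean recreateDbAndTables = Boolean.parseBoolean("TRUE"); // true
            boolean processPiwikLogs = Boolean.parseBoolean(null);      // false
            System.out.println(recreateDbAndTables + " " + processPiwikLogs);
        }
    }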

@@ -8,8 +8,10 @@ import java.sql.PreparedStatement;
 import java.sql.ResultSet;
 import java.sql.Statement;
 import java.text.SimpleDateFormat;
+import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
+import java.util.List;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
@@ -135,13 +137,30 @@ public class LaReferenciaDownloadLogs {
 		String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
 		String content = "";

+		List<Integer> siteIdToVisit = new ArrayList<Integer>();
+
+		// Getting all the siteIds in a list for logging reasons & limiting the list
+		// to the max number of siteIds
 		content = getJson(baseApiUrl);
 		JSONParser parser = new JSONParser();
 		JSONArray jsonArray = (JSONArray) parser.parse(content);
 		for (Object aJsonArray : jsonArray) {
 			JSONObject jsonObjectRow = (JSONObject) aJsonArray;
-			int idSite = Integer.parseInt(jsonObjectRow.get("idsite").toString());
-			this.GetLaReFerenciaLogs(repoLogsPath, idSite);
+			siteIdToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
+		}
+		logger.info("Found the following siteIds for download: " + siteIdToVisit);
+
+		if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
+			ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdToVisit.size()) {
+			logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+			siteIdToVisit = siteIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+		}
+
+		logger.info("Downloading from repos with the followins siteIds: " + siteIdToVisit);
+
+		for (int siteId : siteIdToVisit) {
+			logger.info("Now working on piwikId: " + siteId);
+			this.GetLaReFerenciaLogs(repoLogsPath, siteId);
 		}
 	}
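The fetch-then-trim pattern above is the same one added to PiwikDownloadLogs below (note that this hunk reuses numberOfPiwikIdsToDownload for the LaReferencia siteIds rather than numberOfSiteIdsToDownload). A standalone sketch of the trimming rule, with a hypothetical trimToLimit helper:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class TrimSketch {
        // Hypothetical helper mirroring the pattern above: a limit <= 0 means
        // "download everything"; a limit larger than the list leaves it untouched.
        static List<Integer> trimToLimit(List<Integer> ids, int limit) {
            if (limit > 0 && limit <= ids.size())
                // Copy the result: subList returns a view backed by the original list.
                return new ArrayList<>(ids.subList(0, limit));
            return ids;
        }

        public static void main(String[] args) {
            List<Integer> ids = new ArrayList<>(Arrays.asList(3, 7, 12, 15));
            System.out.println(trimToLimit(ids, 2)); // [3, 7]
            System.out.println(trimToLimit(ids, 0)); // [3, 7, 12, 15]
        }
    }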
@@ -150,17 +169,17 @@ public class LaReferenciaDownloadLogs {
 		logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);

-		SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
-		Calendar start = Calendar.getInstance();
-		start.set(Calendar.YEAR, 2020);
-		start.set(Calendar.MONTH, Calendar.JANUARY);
-		start.set(Calendar.DAY_OF_MONTH, 1);
-
-		Calendar end = Calendar.getInstance();
-		end.add(Calendar.DAY_OF_MONTH, -1);
 		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

+		// Setting the starting period
+		Calendar start = ExecuteWorkflow.startingLogPeriod;
+		logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+
+		// Setting the ending period (last day of the month)
+		Calendar end = ExecuteWorkflow.endingLogPeriod;
+		end.add(Calendar.MONTH, +1);
+		end.add(Calendar.DAY_OF_MONTH, -1);
+		logger.info("Starting period for log download: " + sdf.format(end.getTime()));
+
 		PreparedStatement st = ConnectDB
 			.getHiveConnection()
 			.prepareStatement(
@@ -177,7 +196,8 @@ public class LaReferenciaDownloadLogs {
 			}
 		}
 		rs_date.close();

-		for (Date date = start.getTime(); start.before(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
+		for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+			Date date = currDay.getTime();
 			logger
 				.info(
 					"Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "

@@ -96,10 +96,19 @@ public class PiwikDownloadLogs {
 				"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
 					+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");

-		// Getting all the piwikids in a list for logging reasons
+		// Getting all the piwikids in a list for logging reasons & limitting the list
+		// to the max number of piwikids
 		List<Integer> piwikIdToVisit = new ArrayList<Integer>();
 		while (rs.next())
 			piwikIdToVisit.add(rs.getInt(1));
+		logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
+
+		if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
+			ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
+			logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+			piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+		}
+
 		logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);

 		// Setting the starting period

@@ -44,6 +44,10 @@ public class PiwikStatsDB {
 	public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception {
 		this.logRepoPath = logRepoPath;
 		this.logPortalPath = logPortalPath;
+	}
+
+	public void recreateDBAndTables() throws Exception {
 		this.createDatabase();
 		this.createTables();
 		// The piwiklog table is not needed since it is built
@@ -51,14 +55,6 @@ public class PiwikStatsDB {
 		this.createTmpTables();
 	}

-	public void foo() {
-		Stream<String> s = Arrays.stream(new String[] {
-			"a", "b", "c", "d"
-		});
-		System.out.println(s.parallel().count());
-	}
-
 	public ArrayList getRobotsList() {
 		return robotsList;
 	}
@@ -184,36 +180,35 @@ public class PiwikStatsDB {
 			this.robotsList = counterRobots.getRobotsPatterns();

 			logger.info("Processing repository logs");
-			// processRepositoryLog();
+			processRepositoryLog();
 			logger.info("Repository logs process done");

 			logger.info("Removing double clicks");
-			// removeDoubleClicks();
+			removeDoubleClicks();
 			logger.info("Removing double clicks done");

 			logger.info("Cleaning oai");
-			// cleanOAI();
+			cleanOAI();
 			logger.info("Cleaning oai done");

 			logger.info("ViewsStats processing starts");
-			// viewsStats();
+			viewsStats();
 			logger.info("ViewsStats processing ends");

 			logger.info("DownloadsStats processing starts");
-			// downloadsStats();
+			downloadsStats();
 			logger.info("DownloadsStats processing starts");

 			logger.info("Processing portal logs");
-			// processPortalLog();
+			processPortalLog();
 			logger.info("Portal logs process done");

 			logger.info("Processing portal usagestats");
-			// To see why this never ends
 			portalStats();
 			logger.info("Portal usagestats process done");

 			logger.info("Updating Production Tables");
-			// updateProdTables();
+			updateProdTables();
 			logger.info("Updated Production Tables");

 		} catch (Exception e) {

@@ -98,46 +98,51 @@ public class UsageStatsExporter {
 		// runImpalaQuery();

-		// Create DB tables - they are also needed to download the statistics too
-		logger.info("Creating database and tables");
 		PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);

-		logger.info("Recreating log directories");
-		reCreateLogDirs();
+		logger.info("Re-creating database and tables");
+		if (ExecuteWorkflow.recreateDbAndTables)
+			piwikstatsdb.recreateDBAndTables();
+		;

-		// // Download the statistics - The following 2 lines are not needed after the download - Commenting them out for
-		// // the moment
 		logger.info("Initializing the download logs module");
 		PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);

-		logger.info("Downloading piwik logs");
-		if (ExecuteWorkflow.downloadLogs)
+		// Downloading piwik logs (also managing directory creation)
+		if (ExecuteWorkflow.downloadLogs) {
+			logger.info("Recreating log directories");
+			reCreateLogDirs();
+
+			logger.info("Downloading piwik logs");
 			piwd
 				.GetOpenAIRELogs(
 					ExecuteWorkflow.repoLogPath,
 					ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
+		}
 		logger.info("Downloaded piwik logs");
-		System.exit(0);

 		// Create DB tables, insert/update statistics
-		// String cRobotsUrl = properties.getProperty("COUNTER_robots_Url");
 		String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
 		piwikstatsdb.setCounterRobotsURL(cRobotsUrl);

-		logger.info("Processing logs");
-		piwikstatsdb.processLogs();
-		// log.info("process logs done");
+		if (ExecuteWorkflow.processPiwikLogs) {
+			logger.info("Processing logs");
+			piwikstatsdb.processLogs();
+		}

 		logger.info("Creating LaReferencia tables");
 		LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
 			ExecuteWorkflow.lareferenciaAuthToken);
 		logger.info("Downloading LaReferencia logs");
-		// lrf.GetLaReferenciaRepos(lareferenciaLogPath);
+		lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
 		logger.info("Downloaded LaReferencia logs");
 		LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);

 		logger.info("Processing LaReferencia logs");
 		// lastats.processLogs();
 		// log.info("LaReferencia logs done");
+		System.exit(0);

 		// IrusStats irusstats = new IrusStats(irusUKBaseURL);
 		// irusstats.getIrusRRReport(irusUKReportPath);

@@ -95,6 +95,18 @@
 		"paramDescription": "activate tranform-only mode. Only apply transformation step",
 		"paramRequired": true
 	},
+	{
+		"paramName": "rdbt",
+		"paramLongName": "recreateDbAndTables",
+		"paramDescription": "Re-create database and initial tables?",
+		"paramRequired": true
+	},
+	{
+		"paramName": "ppwl",
+		"paramLongName": "processPiwikLogs",
+		"paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
+		"paramRequired": true
+	},
 	{
 		"paramName": "dl",
 		"paramLongName": "downloadLogs",
@@ -112,5 +124,18 @@
 		"paramLongName": "endingLogPeriod",
 		"paramDescription": "Ending log period",
 		"paramRequired": true
+	},
+	{
+		"paramName": "npidd",
+		"paramLongName": "numberOfPiwikIdsToDownload",
+		"paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload",
+		"paramRequired": true
+	},
+	{
+		"paramName": "nsidd",
+		"paramLongName": "numberOfSiteIdsToDownload",
+		"paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload",
+		"paramRequired": true
 	}
 ]

@@ -58,9 +58,13 @@
 			<arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
 			<arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
 			<arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
+			<arg>--recreateDbAndTables</arg><arg>${recreateDbAndTables}</arg>
 			<arg>--downloadLogs</arg><arg>${downloadLogs}</arg>
 			<arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
 			<arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
+			<arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
+			<arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
+			<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
 			<capture-output/>
 		</java>
 		<ok to="End" />