More control parameters and limits on the LaReferencia download

Spyros Zoupanos 2020-10-04 17:03:01 +03:00
parent 2b330dd84c
commit 48d6bf28eb
7 changed files with 120 additions and 45 deletions

View File

@@ -36,9 +36,13 @@ public class ExecuteWorkflow {
static String dbImpalaUrl;
static String usageStatsDBSchema;
static String statsDBSchema;
static boolean recreateDbAndTables;
static boolean downloadLogs;
static Calendar startingLogPeriod;
static Calendar endingLogPeriod;
static int numberOfPiwikIdsToDownload;
static int numberOfSiteIdsToDownload;
static boolean processPiwikLogs;
public static void main(String args[]) throws Exception {
@@ -72,11 +76,21 @@ public class ExecuteWorkflow {
usageStatsDBSchema = parser.get("usageStatsDBSchema");
statsDBSchema = parser.get("statsDBSchema");
if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
recreateDbAndTables = true;
else
recreateDbAndTables = false;
if (parser.get("downloadLogs").toLowerCase().equals("true"))
downloadLogs = true;
else
downloadLogs = false;
if (parser.get("processPiwikLogs").toLowerCase().equals("true"))
processPiwikLogs = true;
else
processPiwikLogs = false;
String startingLogPeriodStr = parser.get("startingLogPeriod");
Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
@@ -85,6 +99,9 @@ public class ExecuteWorkflow {
Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
UsageStatsExporter usagestatsExport = new UsageStatsExporter();
usagestatsExport.export();
}
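
Two details the hunks above leave implicit. First, each of the three if/else blocks is equivalent to a one-line Boolean.parseBoolean(...) call, which is case-insensitive like the toLowerCase().equals("true") test. Second, the startingLogPeriodStr(Date) helper that both period assignments call is not shown in this diff; the sketch below assumes it simply wraps the parsed Date in a Calendar (only its name and call sites come from the diff):

    // Behavior-preserving one-liner for each boolean flag:
    recreateDbAndTables = Boolean.parseBoolean(parser.get("recreateDbAndTables"));

    // Presumed body of the helper used for both period bounds; the diff
    // only shows its name and call sites, so this is a sketch.
    private static Calendar startingLogPeriodStr(Date date) {
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(date);
        return calendar;
    }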

View File

@@ -8,8 +8,10 @@ import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
@@ -135,13 +137,30 @@ public class LaReferenciaDownloadLogs {
String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
String content = "";
List<Integer> siteIdToVisit = new ArrayList<Integer>();
// Getting all the siteIds in a list for logging reasons & limiting the list
// to the max number of siteIds
content = getJson(baseApiUrl);
JSONParser parser = new JSONParser();
JSONArray jsonArray = (JSONArray) parser.parse(content);
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
int idSite = Integer.parseInt(jsonObjectRow.get("idsite").toString());
this.GetLaReFerenciaLogs(repoLogsPath, idSite);
siteIdToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
}
logger.info("Found the following siteIds for download: " + siteIdToVisit);
if (ExecuteWorkflow.numberOfSiteIdsToDownload > 0 &&
ExecuteWorkflow.numberOfSiteIdsToDownload <= siteIdToVisit.size()) {
logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfSiteIdsToDownload);
siteIdToVisit = siteIdToVisit.subList(0, ExecuteWorkflow.numberOfSiteIdsToDownload);
}
logger.info("Downloading from repos with the following siteIds: " + siteIdToVisit);
for (int siteId : siteIdToVisit) {
logger.info("Now working on siteId: " + siteId);
this.GetLaReFerenciaLogs(repoLogsPath, siteId);
}
}
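
One caveat on the trimming above: List.subList returns a view backed by the original list, not an independent copy. If the trimmed list might be kept around while the original changes, a defensive copy (not part of this commit) decouples them:

    // Defensive copy of the trimmed view (illustrative, not committed code):
    siteIdToVisit = new ArrayList<>(siteIdToVisit.subList(0, ExecuteWorkflow.numberOfSiteIdsToDownload));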
@@ -150,17 +169,17 @@ public class LaReferenciaDownloadLogs {
logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
Calendar start = Calendar.getInstance();
start.set(Calendar.YEAR, 2020);
start.set(Calendar.MONTH, Calendar.JANUARY);
start.set(Calendar.DAY_OF_MONTH, 1);
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
// Setting the starting period
Calendar start = ExecuteWorkflow.startingLogPeriod;
logger.info("Starting period for log download: " + sdf.format(start.getTime()));
// Setting the ending period (last day of the month)
Calendar end = ExecuteWorkflow.endingLogPeriod;
end.add(Calendar.MONTH, +1);
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("Starting period for log download: " + sdf.format(end.getTime()));
PreparedStatement st = ConnectDB
.getHiveConnection()
.prepareStatement(
@@ -177,7 +196,8 @@ public class LaReferenciaDownloadLogs {
}
rs_date.close();
for (Date date = start.getTime(); start.before(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
Date date = currDay.getTime();
logger
.info(
"Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "

View File

@@ -96,10 +96,19 @@ public class PiwikDownloadLogs {
"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
// Getting all the piwikids in a list for logging reasons
// Getting all the piwikids in a list for logging reasons & limiting the list
// to the max number of piwikids
List<Integer> piwikIdToVisit = new ArrayList<Integer>();
while (rs.next())
piwikIdToVisit.add(rs.getInt(1));
logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
}
logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
// Setting the starting period
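
The guard pattern is the same in both download classes: the limit applies only when it is positive and no larger than the list, so a zero, negative, or oversized limit leaves the list whole and the subList bounds always stay valid. A quick sketch of the semantics with hypothetical values:

    // Hypothetical: five ids found, limit of 3 (assumes java.util.Arrays is imported).
    List<Integer> ids = new ArrayList<>(Arrays.asList(101, 102, 103, 104, 105));
    int limit = 3;
    if (limit > 0 && limit <= ids.size())
        ids = ids.subList(0, limit); // [101, 102, 103]
    // limit = 0, limit < 0, or limit > ids.size(): list stays whole.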

View File

@@ -44,6 +44,10 @@ public class PiwikStatsDB {
public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception {
this.logRepoPath = logRepoPath;
this.logPortalPath = logPortalPath;
}
public void recreateDBAndTables() throws Exception {
this.createDatabase();
this.createTables();
// The piwiklog table is not needed since it is built
@@ -51,14 +55,6 @@ public class PiwikStatsDB {
this.createTmpTables();
}
public void foo() {
Stream<String> s = Arrays.stream(new String[] {
"a", "b", "c", "d"
});
System.out.println(s.parallel().count());
}
public ArrayList getRobotsList() {
return robotsList;
}
@@ -184,36 +180,35 @@ public class PiwikStatsDB {
this.robotsList = counterRobots.getRobotsPatterns();
logger.info("Processing repository logs");
// processRepositoryLog();
processRepositoryLog();
logger.info("Repository logs process done");
logger.info("Removing double clicks");
// removeDoubleClicks();
removeDoubleClicks();
logger.info("Removing double clicks done");
logger.info("Cleaning oai");
// cleanOAI();
cleanOAI();
logger.info("Cleaning oai done");
logger.info("ViewsStats processing starts");
// viewsStats();
viewsStats();
logger.info("ViewsStats processing ends");
logger.info("DownloadsStats processing starts");
// downloadsStats();
downloadsStats();
logger.info("DownloadsStats processing starts");
logger.info("Processing portal logs");
// processPortalLog();
processPortalLog();
logger.info("Portal logs process done");
logger.info("Processing portal usagestats");
// TODO: check why this never ends
portalStats();
logger.info("Portal usagestats process done");
logger.info("Updating Production Tables");
// updateProdTables();
updateProdTables();
logger.info("Updated Production Tables");
} catch (Exception e) {

View File

@@ -98,46 +98,51 @@ public class UsageStatsExporter {
// runImpalaQuery();
// Create DB tables - they are also needed to download the statistics
logger.info("Creating database and tables");
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
logger.info("Recreating log directories");
reCreateLogDirs();
logger.info("Re-creating database and tables");
if (ExecuteWorkflow.recreateDbAndTables)
piwikstatsdb.recreateDBAndTables();
// // Download the statistics - The following 2 lines are not needed after the download - Commenting them out for
// // the moment
logger.info("Initializing the download logs module");
PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
logger.info("Downloading piwik logs");
if (ExecuteWorkflow.downloadLogs)
// Downloading piwik logs (also managing directory creation)
if (ExecuteWorkflow.downloadLogs) {
logger.info("Recreating log directories");
reCreateLogDirs();
logger.info("Downloading piwik logs");
piwd
.GetOpenAIRELogs(
ExecuteWorkflow.repoLogPath,
ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
}
logger.info("Downloaded piwik logs");
System.exit(0);
// Create DB tables, insert/update statistics
// String cRobotsUrl = properties.getProperty("COUNTER_robots_Url");
String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
logger.info("Processing logs");
piwikstatsdb.processLogs();
// log.info("process logs done");
if (ExecuteWorkflow.processPiwikLogs) {
logger.info("Processing logs");
piwikstatsdb.processLogs();
}
logger.info("Creating LaReferencia tables");
LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
ExecuteWorkflow.lareferenciaAuthToken);
logger.info("Downloading LaReferencia logs");
// lrf.GetLaReferenciaRepos(lareferenciaLogPath);
lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
logger.info("Downloaded LaReferencia logs");
LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
logger.info("Processing LaReferencia logs");
// lastats.processLogs();
// log.info("LaReferencia logs done");
System.exit(0);
// IrusStats irusstats = new IrusStats(irusUKBaseURL);
// irusstats.getIrusRRReport(irusUKReportPath);

View File

@@ -95,6 +95,18 @@
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
},
{
"paramName": "rdbt",
"paramLongName": "recreateDbAndTables",
"paramDescription": "Re-create database and initial tables?",
"paramRequired": true
},
{
"paramName": "ppwl",
"paramLongName": "processPiwikLogs",
"paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
"paramRequired": true
},
{
"paramName": "dl",
"paramLongName": "downloadLogs",
@@ -112,5 +124,18 @@
"paramLongName": "endingLogPeriod",
"paramDescription": "Ending log period",
"paramRequired": true
},
{
"paramName": "npidd",
"paramLongName": "numberOfPiwikIdsToDownload",
"paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload",
"paramRequired": true
},
{
"paramName": "nsidd",
"paramLongName": "numberOfSiteIdsToDownload",
"paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload",
"paramRequired": true
}
]

View File

@@ -58,9 +58,13 @@
<arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
<arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
<arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
<arg>--recreateDbAndTables</arg><arg>${recreateDbAndTables}</arg>
<arg>--downloadLogs</arg><arg>${downloadLogs}</arg>
<arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
<arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
<arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
<arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
<capture-output/>
</java>
<ok to="End" />
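
For reference, the new workflow properties would be supplied with values along these lines (all values are illustrative; the MM/yyyy period format matches the SimpleDateFormat used in ExecuteWorkflow):

    recreateDbAndTables=false
    downloadLogs=true
    processPiwikLogs=false
    startingLogPeriod=01/2020
    endingLogPeriod=02/2020
    numberOfPiwikIdsToDownload=5
    numberOfSiteIdsToDownload=5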