forked from D-Net/dnet-hadoop
More control parameters and limits on the LaReferencia download
This commit is contained in:
parent 2b330dd84c
commit 48d6bf28eb
@@ -36,9 +36,13 @@ public class ExecuteWorkflow {
	static String dbImpalaUrl;
	static String usageStatsDBSchema;
	static String statsDBSchema;
	static boolean recreateDbAndTables;
	static boolean downloadLogs;
	static Calendar startingLogPeriod;
	static Calendar endingLogPeriod;
	static int numberOfPiwikIdsToDownload;
	static int numberOfSiteIdsToDownload;
	static boolean processPiwikLogs;

	public static void main(String args[]) throws Exception {
@@ -72,11 +76,21 @@ public class ExecuteWorkflow {
	usageStatsDBSchema = parser.get("usageStatsDBSchema");
	statsDBSchema = parser.get("statsDBSchema");

	if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
		recreateDbAndTables = true;
	else
		recreateDbAndTables = false;

	if (parser.get("downloadLogs").toLowerCase().equals("true"))
		downloadLogs = true;
	else
		downloadLogs = false;

	if (parser.get("processPiwikLogs").toLowerCase().equals("true"))
		processPiwikLogs = true;
	else
		processPiwikLogs = false;

	String startingLogPeriodStr = parser.get("startingLogPeriod");
	Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
	startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);

@@ -85,6 +99,9 @@ public class ExecuteWorkflow {
	Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
	endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);

	numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
	numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));

	UsageStatsExporter usagestatsExport = new UsageStatsExporter();
	usagestatsExport.export();
}

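Both period assignments above call a startingLogPeriodStr(Date) helper that this diff does not show; presumably it just wraps the parsed Date in a Calendar. A minimal sketch under that assumption (the class name, body, and main method are illustrative; only the helper's name and call signature appear in the hunk):

	import java.text.SimpleDateFormat;
	import java.util.Calendar;
	import java.util.Date;

	public class PeriodHelperSketch {
		// Assumed reconstruction of the helper called in the hunk above;
		// the actual body is not part of this commit's visible changes.
		static Calendar startingLogPeriodStr(Date date) {
			Calendar calendar = Calendar.getInstance();
			calendar.setTime(date);
			return calendar;
		}

		public static void main(String[] args) throws Exception {
			// "MM/yyyy" matches the format used by ExecuteWorkflow
			Date d = new SimpleDateFormat("MM/yyyy").parse("01/2020");
			System.out.println(startingLogPeriodStr(d).getTime()); // Jan 01 00:00:00 ... 2020
		}
	}
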
@@ -8,8 +8,10 @@ import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
@@ -135,13 +137,30 @@ public class LaReferenciaDownloadLogs {
	String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
	String content = "";

	List<Integer> siteIdToVisit = new ArrayList<Integer>();

	// Getting all the siteIds in a list for logging reasons & limiting the list
	// to the max number of siteIds
	content = getJson(baseApiUrl);
	JSONParser parser = new JSONParser();
	JSONArray jsonArray = (JSONArray) parser.parse(content);
	for (Object aJsonArray : jsonArray) {
		JSONObject jsonObjectRow = (JSONObject) aJsonArray;
		int idSite = Integer.parseInt(jsonObjectRow.get("idsite").toString());
		this.GetLaReFerenciaLogs(repoLogsPath, idSite);
		siteIdToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
	}
	logger.info("Found the following siteIds for download: " + siteIdToVisit);

	if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
		ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdToVisit.size()) {
		logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
		siteIdToVisit = siteIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
	}

	logger.info("Downloading from repos with the following siteIds: " + siteIdToVisit);

	for (int siteId : siteIdToVisit) {
		logger.info("Now working on piwikId: " + siteId);
		this.GetLaReFerenciaLogs(repoLogsPath, siteId);
	}
}

@@ -150,17 +169,17 @@ public class LaReferenciaDownloadLogs {

	logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);

	SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");

	Calendar start = Calendar.getInstance();
	start.set(Calendar.YEAR, 2020);
	start.set(Calendar.MONTH, Calendar.JANUARY);
	start.set(Calendar.DAY_OF_MONTH, 1);

	Calendar end = Calendar.getInstance();
	end.add(Calendar.DAY_OF_MONTH, -1);

	SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
	// Setting the starting period
	Calendar start = ExecuteWorkflow.startingLogPeriod;
	logger.info("Starting period for log download: " + sdf.format(start.getTime()));

	// Setting the ending period (last day of the month)
	Calendar end = ExecuteWorkflow.endingLogPeriod;
	end.add(Calendar.MONTH, +1);
	end.add(Calendar.DAY_OF_MONTH, -1);
	logger.info("Ending period for log download: " + sdf.format(end.getTime()));

	PreparedStatement st = ConnectDB
		.getHiveConnection()
		.prepareStatement(
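A quick sanity check of the month-end arithmetic above: the ending period arrives as the first day of its month (it was parsed with "MM/yyyy" in ExecuteWorkflow), so adding one month and subtracting one day lands on the last day of the requested month. A small self-contained illustration (the sample value is made up):

	import java.text.SimpleDateFormat;
	import java.util.Calendar;

	public class MonthEndCheck {
		public static void main(String[] args) throws Exception {
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
			Calendar end = Calendar.getInstance();
			// Suppose endingLogPeriod was parsed from "02/2020" -> 2020-02-01
			end.setTime(new SimpleDateFormat("MM/yyyy").parse("02/2020"));
			end.add(Calendar.MONTH, +1);        // 2020-03-01
			end.add(Calendar.DAY_OF_MONTH, -1); // 2020-02-29 (leap year handled)
			System.out.println(sdf.format(end.getTime())); // prints 2020-02-29
		}
	}
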
@@ -177,7 +196,8 @@ public class LaReferenciaDownloadLogs {
	}
	rs_date.close();

	for (Date date = start.getTime(); start.before(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
	for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
		Date date = currDay.getTime();
		logger
			.info(
				"Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "

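The rewritten loop iterates on a clone of the start Calendar, so the shared ExecuteWorkflow.startingLogPeriod is no longer mutated as a side effect of downloading one repository's logs. A minimal runnable sketch of the same pattern (dates are illustrative):

	import java.text.SimpleDateFormat;
	import java.util.Calendar;
	import java.util.Date;

	public class DayLoopSketch {
		public static void main(String[] args) throws Exception {
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
			Calendar start = Calendar.getInstance();
			start.setTime(sdf.parse("2020-01-30"));
			Calendar end = Calendar.getInstance();
			end.setTime(sdf.parse("2020-02-02"));

			// Iterate on a clone; 'start' itself stays untouched.
			for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
				Date date = currDay.getTime();
				System.out.println(sdf.format(date)); // 2020-01-30, 2020-01-31, 2020-02-01
			}
		}
	}
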
@@ -96,10 +96,19 @@ public class PiwikDownloadLogs {
	"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
		+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");

	// Getting all the piwikids in a list for logging reasons
	// Getting all the piwikids in a list for logging reasons & limiting the list
	// to the max number of piwikids
	List<Integer> piwikIdToVisit = new ArrayList<Integer>();
	while (rs.next())
		piwikIdToVisit.add(rs.getInt(1));
	logger.info("Found the following piwikIds for download: " + piwikIdToVisit);

	if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
		ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
		logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
		piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
	}

	logger.info("Downloading from repos with the following piwikIds: " + piwikIdToVisit);

	// Setting the starting period

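The same trimming guard appears twice in this commit, for siteIds above and piwikIds here. It relies on List.subList, which returns a view backed by the original list rather than a copy; that is fine for the read-only iteration that follows. A small standalone illustration (names and values are illustrative):

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.List;

	public class TrimSketch {
		public static void main(String[] args) {
			List<Integer> idsToVisit = new ArrayList<Integer>(Arrays.asList(11, 22, 33, 44, 55));
			int limit = 3; // stands in for ExecuteWorkflow.numberOfPiwikIdsToDownload

			// Same guard as in the commit: only trim when the limit is positive
			// and does not exceed the list size (subList would throw otherwise).
			if (limit > 0 && limit <= idsToVisit.size()) {
				idsToVisit = idsToVisit.subList(0, limit); // a view, not a copy
			}
			System.out.println(idsToVisit); // [11, 22, 33]
		}
	}
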
@@ -44,6 +44,10 @@ public class PiwikStatsDB {
	public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception {
		this.logRepoPath = logRepoPath;
		this.logPortalPath = logPortalPath;

	}

	public void recreateDBAndTables() throws Exception {
		this.createDatabase();
		this.createTables();
		// The piwiklog table is not needed since it is built

@@ -51,14 +55,6 @@ public class PiwikStatsDB {
		this.createTmpTables();
	}

	public void foo() {
		Stream<String> s = Arrays.stream(new String[] {
			"a", "b", "c", "d"
		});

		System.out.println(s.parallel().count());
	}

	public ArrayList getRobotsList() {
		return robotsList;
	}

@@ -184,36 +180,35 @@ public class PiwikStatsDB {
	this.robotsList = counterRobots.getRobotsPatterns();

	logger.info("Processing repository logs");
	// processRepositoryLog();
	processRepositoryLog();
	logger.info("Repository logs process done");

	logger.info("Removing double clicks");
	// removeDoubleClicks();
	removeDoubleClicks();
	logger.info("Removing double clicks done");

	logger.info("Cleaning oai");
	// cleanOAI();
	cleanOAI();
	logger.info("Cleaning oai done");

	logger.info("ViewsStats processing starts");
	// viewsStats();
	viewsStats();
	logger.info("ViewsStats processing ends");

	logger.info("DownloadsStats processing starts");
	// downloadsStats();
	downloadsStats();
	logger.info("DownloadsStats processing ends");

	logger.info("Processing portal logs");
	// processPortalLog();
	processPortalLog();
	logger.info("Portal logs process done");

	logger.info("Processing portal usagestats");
	// To see why this never ends
	portalStats();
	logger.info("Portal usagestats process done");

	logger.info("Updating Production Tables");
	// updateProdTables();
	updateProdTables();
	logger.info("Updated Production Tables");

} catch (Exception e) {

@@ -98,46 +98,51 @@ public class UsageStatsExporter {

	// runImpalaQuery();

	// Create DB tables - they are also needed to download the statistics too
	logger.info("Creating database and tables");
	PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);

	logger.info("Recreating log directories");
	reCreateLogDirs();
	logger.info("Re-creating database and tables");
	if (ExecuteWorkflow.recreateDbAndTables)
		piwikstatsdb.recreateDBAndTables();

	// // Download the statistics - The following 2 lines are not needed after the download - Commenting them out for
	// // the moment
	logger.info("Initializing the download logs module");
	PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
	logger.info("Downloading piwik logs");
	if (ExecuteWorkflow.downloadLogs)

	// Downloading piwik logs (also managing directory creation)
	if (ExecuteWorkflow.downloadLogs) {
		logger.info("Recreating log directories");
		reCreateLogDirs();

		logger.info("Downloading piwik logs");
		piwd
			.GetOpenAIRELogs(
				ExecuteWorkflow.repoLogPath,
				ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
	}
	logger.info("Downloaded piwik logs");

	System.exit(0);

	// Create DB tables, insert/update statistics
	// String cRobotsUrl = properties.getProperty("COUNTER_robots_Url");
	String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
	piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
	logger.info("Processing logs");
	piwikstatsdb.processLogs();
	// log.info("process logs done");

	if (ExecuteWorkflow.processPiwikLogs) {
		logger.info("Processing logs");
		piwikstatsdb.processLogs();
	}

	logger.info("Creating LaReferencia tables");
	LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
		ExecuteWorkflow.lareferenciaAuthToken);
	logger.info("Downloading LaReferencia logs");
	// lrf.GetLaReferenciaRepos(lareferenciaLogPath);
	lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
	logger.info("Downloaded LaReferencia logs");
	LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
	logger.info("Processing LaReferencia logs");
	// lastats.processLogs();
	// log.info("LaReferencia logs done");

	System.exit(0);

	// IrusStats irusstats = new IrusStats(irusUKBaseURL);
	// irusstats.getIrusRRReport(irusUKReportPath);

@@ -95,6 +95,18 @@
  "paramDescription": "activate transform-only mode. Only apply transformation step",
  "paramRequired": true
},
{
  "paramName": "rdbt",
  "paramLongName": "recreateDbAndTables",
  "paramDescription": "Re-create database and initial tables?",
  "paramRequired": true
},
{
  "paramName": "ppwl",
  "paramLongName": "processPiwikLogs",
  "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
  "paramRequired": true
},
{
  "paramName": "dl",
  "paramLongName": "downloadLogs",

@@ -112,5 +124,18 @@
  "paramLongName": "endingLogPeriod",
  "paramDescription": "Ending log period",
  "paramRequired": true
},
{
  "paramName": "npidd",
  "paramLongName": "numberOfPiwikIdsToDownload",
  "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload",
  "paramRequired": true
},
{
  "paramName": "nsidd",
  "paramLongName": "numberOfSiteIdsToDownload",
  "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload",
  "paramRequired": true
}
]

@@ -58,9 +58,13 @@
	<arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
	<arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
	<arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
	<arg>--recreateDbAndTables</arg><arg>${recreateDbAndTables}</arg>
	<arg>--downloadLogs</arg><arg>${downloadLogs}</arg>
	<arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
	<arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
	<arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
	<arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
	<arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
	<capture-output/>
</java>
<ok to="End" />