diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java
index 834539d2d..6bb349062 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java
@@ -36,9 +36,13 @@ public class ExecuteWorkflow {
 	static String dbImpalaUrl;
 	static String usageStatsDBSchema;
 	static String statsDBSchema;
+	static boolean recreateDbAndTables;
 	static boolean downloadLogs;
 	static Calendar startingLogPeriod;
 	static Calendar endingLogPeriod;
+	static int numberOfPiwikIdsToDownload;
+	static int numberOfSiteIdsToDownload;
+	static boolean processPiwikLogs;
 
 	public static void main(String args[]) throws Exception {
 
@@ -72,11 +76,21 @@ public class ExecuteWorkflow {
 		usageStatsDBSchema = parser.get("usageStatsDBSchema");
 		statsDBSchema = parser.get("statsDBSchema");
 
+		if (parser.get("recreateDbAndTables").toLowerCase().equals("true"))
+			recreateDbAndTables = true;
+		else
+			recreateDbAndTables = false;
+
 		if (parser.get("downloadLogs").toLowerCase().equals("true"))
 			downloadLogs = true;
 		else
 			downloadLogs = false;
 
+		if (parser.get("processPiwikLogs").toLowerCase().equals("true"))
+			processPiwikLogs = true;
+		else
+			processPiwikLogs = false;
+
 		String startingLogPeriodStr = parser.get("startingLogPeriod");
 		Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
 		startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
@@ -85,6 +99,9 @@ public class ExecuteWorkflow {
 		Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
 		endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
 
+		numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
+		numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
+
 		UsageStatsExporter usagestatsExport = new UsageStatsExporter();
 		usagestatsExport.export();
 	}
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java
index c7d5014db..a3ef14f20 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java
@@ -8,8 +8,10 @@ import java.sql.PreparedStatement;
 import java.sql.ResultSet;
 import java.sql.Statement;
 import java.text.SimpleDateFormat;
+import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
+import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
@@ -135,13 +137,30 @@ public class LaReferenciaDownloadLogs {
 		String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
 		String content = "";
 
+		List<Integer> siteIdToVisit = new ArrayList<Integer>();
+
+		// Getting all the siteIds in a list for logging reasons & limiting the list
+		// to the max number of siteIds
 		content = getJson(baseApiUrl);
 		JSONParser parser = new JSONParser();
 		JSONArray jsonArray = (JSONArray) parser.parse(content);
 		for (Object aJsonArray : jsonArray) {
 			JSONObject jsonObjectRow = (JSONObject) aJsonArray;
-			int idSite = Integer.parseInt(jsonObjectRow.get("idsite").toString());
-			this.GetLaReFerenciaLogs(repoLogsPath, idSite);
+			siteIdToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
+		}
+		logger.info("Found the following siteIds for download: " + siteIdToVisit);
+
+		if (ExecuteWorkflow.numberOfSiteIdsToDownload > 0 &&
+			ExecuteWorkflow.numberOfSiteIdsToDownload <= siteIdToVisit.size()) {
+			logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfSiteIdsToDownload);
+			siteIdToVisit = siteIdToVisit.subList(0, ExecuteWorkflow.numberOfSiteIdsToDownload);
+		}
+
+		logger.info("Downloading from repos with the following siteIds: " + siteIdToVisit);
+
+		for (int siteId : siteIdToVisit) {
+			logger.info("Now working on LaReferencia siteId: " + siteId);
+			this.GetLaReFerenciaLogs(repoLogsPath, siteId);
 		}
 	}
@@ -150,17 +169,17 @@
 		logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);
 
-		SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
-
-		Calendar start = Calendar.getInstance();
-		start.set(Calendar.YEAR, 2020);
-		start.set(Calendar.MONTH, Calendar.JANUARY);
-		start.set(Calendar.DAY_OF_MONTH, 1);
-
-		Calendar end = Calendar.getInstance();
-		end.add(Calendar.DAY_OF_MONTH, -1);
 		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+		// Setting the starting period
+		Calendar start = ExecuteWorkflow.startingLogPeriod;
+		logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+
+		// Setting the ending period (last day of the month)
+		Calendar end = ExecuteWorkflow.endingLogPeriod;
+		end.add(Calendar.MONTH, +1);
+		end.add(Calendar.DAY_OF_MONTH, -1);
+		logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+
 		PreparedStatement st = ConnectDB
 			.getHiveConnection()
 			.prepareStatement(
@@ -177,7 +196,8 @@ public class LaReferenciaDownloadLogs {
 		}
 		rs_date.close();
 
-		for (Date date = start.getTime(); start.before(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
+		for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+			Date date = currDay.getTime();
 			logger
 				.info(
 					"Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java
index 84d7f8c39..abe99fc8d 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java
@@ -96,10 +96,19 @@
 					"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
 						+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
 
-		// Getting all the piwikids in a list for logging reasons
+		// Getting all the piwikids in a list for logging reasons & limiting the list
+		// to the max number of piwikids
 		List<Integer> piwikIdToVisit = new ArrayList<Integer>();
 		while (rs.next())
 			piwikIdToVisit.add(rs.getInt(1));
+		logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
+
+		if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
+			ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
+			logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+			piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+		}
+
 		logger.info("Downloading from repos with the following piwikIds: " + piwikIdToVisit);
 
 		// Setting the starting period
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
index 4abb1d1d9..a8a965866 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikStatsDB.java
@@ -44,6 +44,10 @@ public class PiwikStatsDB {
 	public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception {
 		this.logRepoPath = logRepoPath;
 		this.logPortalPath = logPortalPath;
+
+	}
+
+	public void recreateDBAndTables() throws Exception {
 		this.createDatabase();
 		this.createTables();
 		// The piwiklog table is not needed since it is built
@@ -51,14 +55,6 @@
 		this.createTmpTables();
 	}
 
-	public void foo() {
-		Stream<String> s = Arrays.stream(new String[] {
-			"a", "b", "c", "d"
-		});
-
-		System.out.println(s.parallel().count());
-	}
-
 	public ArrayList getRobotsList() {
 		return robotsList;
 	}
@@ -184,36 +180,35 @@ public class PiwikStatsDB {
 			this.robotsList = counterRobots.getRobotsPatterns();
 
 			logger.info("Processing repository logs");
-//			processRepositoryLog();
+			processRepositoryLog();
 			logger.info("Repository logs process done");
 
 			logger.info("Removing double clicks");
-//			removeDoubleClicks();
+			removeDoubleClicks();
 			logger.info("Removing double clicks done");
 
 			logger.info("Cleaning oai");
-//			cleanOAI();
+			cleanOAI();
 			logger.info("Cleaning oai done");
 
 			logger.info("ViewsStats processing starts");
-//			viewsStats();
+			viewsStats();
 			logger.info("ViewsStats processing ends");
 
 			logger.info("DownloadsStats processing starts");
-//			downloadsStats();
+			downloadsStats();
 			logger.info("DownloadsStats processing ends");
 
 			logger.info("Processing portal logs");
-//			processPortalLog();
+			processPortalLog();
 			logger.info("Portal logs process done");
 
 			logger.info("Processing portal usagestats");
-			// To see why this never ends
 			portalStats();
 			logger.info("Portal usagestats process done");
 
 			logger.info("Updating Production Tables");
-//			updateProdTables();
+			updateProdTables();
 			logger.info("Updated Production Tables");
 
 		} catch (Exception e) {
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
index 473bcc3fd..5eddaf450 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java
@@ -98,46 +98,51 @@ public class UsageStatsExporter {
 		// runImpalaQuery();
 
-		// Create DB tables - they are also needed to download the statistics too
-		logger.info("Creating database and tables");
 		PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
-		logger.info("Recreating log directories");
-		reCreateLogDirs();
+		logger.info("Re-creating database and tables");
+		if (ExecuteWorkflow.recreateDbAndTables)
+			piwikstatsdb.recreateDBAndTables();
 
-//		// Download the statistics - The following 2 lines are not needed after the download - Commenting them out for
-//		// the moment
 		logger.info("Initializing the download logs module");
 		PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
-		logger.info("Downloading piwik logs");
-		if (ExecuteWorkflow.downloadLogs)
+
+		// Downloading piwik logs (also managing directory creation)
+		if (ExecuteWorkflow.downloadLogs) {
+			logger.info("Recreating log directories");
+			reCreateLogDirs();
+
+			logger.info("Downloading piwik logs");
 			piwd
 				.GetOpenAIRELogs(
 					ExecuteWorkflow.repoLogPath,
 					ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
+		}
 		logger.info("Downloaded piwik logs");
 
-		System.exit(0);
-
 		// Create DB tables, insert/update statistics
-//		String cRobotsUrl = properties.getProperty("COUNTER_robots_Url");
 		String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
 		piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
-		logger.info("Processing logs");
-		piwikstatsdb.processLogs();
-//		log.info("process logs done");
+
+		if (ExecuteWorkflow.processPiwikLogs) {
+			logger.info("Processing logs");
+			piwikstatsdb.processLogs();
+		}
 
 		logger.info("Creating LaReferencia tables");
 		LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
 			ExecuteWorkflow.lareferenciaAuthToken);
 		logger.info("Downloading LaReferencia logs");
-//		lrf.GetLaReferenciaRepos(lareferenciaLogPath);
+		lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
 		logger.info("Downloaded LaReferencia logs");
 		LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
 		logger.info("Processing LaReferencia logs");
 		// lastats.processLogs();
 		// log.info("LaReferencia logs done");
 
+		System.exit(0);
+
 		// IrusStats irusstats = new IrusStats(irusUKBaseURL);
 		// irusstats.getIrusRRReport(irusUKReportPath);
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json
index b5c28ca1f..56d5316be 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json
@@ -95,6 +95,18 @@
 		"paramDescription": "activate transform-only mode. Only apply transformation step",
 		"paramRequired": true
 	},
+	{
+		"paramName": "rdbt",
+		"paramLongName": "recreateDbAndTables",
+		"paramDescription": "Re-create database and initial tables?",
+		"paramRequired": true
+	},
+	{
+		"paramName": "ppwl",
+		"paramLongName": "processPiwikLogs",
+		"paramDescription": "Process the piwik logs (create & fill in the needed tables and process the data) based on the downloaded data",
+		"paramRequired": true
+	},
 	{
 		"paramName": "dl",
 		"paramLongName": "downloadLogs",
@@ -112,5 +124,18 @@
 		"paramLongName": "endingLogPeriod",
 		"paramDescription": "Ending log period",
 		"paramRequired": true
+	},
+	{
+		"paramName": "npidd",
+		"paramLongName": "numberOfPiwikIdsToDownload",
+		"paramDescription": "Limit the number of downloaded piwikIds to the first numberOfPiwikIdsToDownload",
+		"paramRequired": true
+	},
+	{
+		"paramName": "nsidd",
+		"paramLongName": "numberOfSiteIdsToDownload",
+		"paramDescription": "Limit the number of downloaded siteIds (La Referencia logs) to the first numberOfSiteIdsToDownload",
+		"paramRequired": true
 	}
 ]
+
diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml
index 8d281fd62..39c3641b4 100644
--- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml
@@ -58,9 +58,13 @@
             <arg>--dbImpalaUrl</arg><arg>${impalaJdbcUrl}</arg>
            <arg>--usageStatsDBSchema</arg><arg>${usageStatsDBSchema}</arg>
            <arg>--statsDBSchema</arg><arg>${statsDBSchema}</arg>
+           <arg>--recreateDbAndTables</arg><arg>${recreateDbAndTables}</arg>
            <arg>--downloadLogs</arg><arg>${downloadLogs}</arg>
            <arg>--startingLogPeriod</arg><arg>${startingLogPeriod}</arg>
            <arg>--endingLogPeriod</arg><arg>${endingLogPeriod}</arg>
+           <arg>--numberOfPiwikIdsToDownload</arg><arg>${numberOfPiwikIdsToDownload}</arg>
+           <arg>--numberOfSiteIdsToDownload</arg><arg>${numberOfSiteIdsToDownload}</arg>
+           <arg>--processPiwikLogs</arg><arg>${processPiwikLogs}</arg>
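
Note on the flag parsing added in ExecuteWorkflow.java: each four-line if/else can collapse to a single Boolean.parseBoolean call, which also matches "true" case-insensitively. A minimal, self-contained sketch; the Map below is only a stand-in for the patch's parser object, not the real argument parser:

import java.util.HashMap;
import java.util.Map;

public class FlagParsingSketch {
	public static void main(String[] args) {
		// Hypothetical stand-in for the patch's parser.get(...)
		Map<String, String> parser = new HashMap<String, String>();
		parser.put("downloadLogs", "True");

		// Pattern used in the patch: case-insensitive match against "true"
		boolean downloadLogs;
		if (parser.get("downloadLogs").toLowerCase().equals("true"))
			downloadLogs = true;
		else
			downloadLogs = false;

		// Equivalent one-liner: Boolean.parseBoolean is also case-insensitive.
		// One behavioural difference: a missing (null) value yields false here,
		// while the pattern above throws a NullPointerException.
		boolean viaParseBoolean = Boolean.parseBoolean(parser.get("downloadLogs"));

		System.out.println(downloadLogs + " " + viaParseBoolean); // true true
	}
}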
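
Note on the rewritten day loop in LaReferenciaDownloadLogs.java: iterating on a clone of the start Calendar avoids mutating the shared ExecuteWorkflow.startingLogPeriod instance (the old loop advanced start itself, which was harmless only while start was a local Calendar). The end Calendar, by contrast, is still mutated in place by the two add() calls, so a second invocation would shift the period again. A runnable sketch of the cloning pattern, with hypothetical fixed dates standing in for the workflow parameters:

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

public class DayLoopSketch {
	public static void main(String[] args) {
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

		// Hypothetical fixed period; the patch takes these from
		// ExecuteWorkflow.startingLogPeriod / endingLogPeriod instead.
		Calendar start = Calendar.getInstance();
		start.clear();
		start.set(2020, Calendar.JANUARY, 1);

		Calendar end = Calendar.getInstance();
		end.clear();
		end.set(2020, Calendar.JANUARY, 5);

		// Cloning leaves the original start Calendar untouched for later callers.
		for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
			Date date = currDay.getTime();
			System.out.println(sdf.format(date)); // 2020-01-01 .. 2020-01-04 (end is exclusive)
		}
	}
}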
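
Note on the id-list trimming added to both download classes: List.subList returns a view backed by the original list, which is fine for the read-only iteration that follows, but structural changes to the backing list would invalidate it. A small sketch with hypothetical list contents:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TrimSketch {
	public static void main(String[] args) {
		// Hypothetical ids; the patch fills this from the datasource table / Matomo API
		List<Integer> piwikIdToVisit = new ArrayList<Integer>(Arrays.asList(5, 8, 13, 21, 34));
		int numberOfPiwikIdsToDownload = 3; // stand-in for ExecuteWorkflow.numberOfPiwikIdsToDownload

		if (numberOfPiwikIdsToDownload > 0 &&
			numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
			// subList(0, n) keeps the first n ids; it is a view, not a copy
			piwikIdToVisit = piwikIdToVisit.subList(0, numberOfPiwikIdsToDownload);
		}

		System.out.println(piwikIdToVisit); // [5, 8, 13]
	}
}

With the guard as written, a limit larger than the list size leaves the list untrimmed, so everything is downloaded; this is consistent with treating 0 (or a negative value) as "no limit".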