From 8da64d8f54f43112e06e59c3f0ac4a8059878699 Mon Sep 17 00:00:00 2001 From: Spyros Zoupanos Date: Tue, 6 Oct 2020 23:44:25 +0300 Subject: [PATCH] Adding flags and time limits to Irus stats --- .../usagestats/export/ExecuteWorkflow.java | 19 +++++ .../oa/graph/usagestats/export/IrusStats.java | 74 +++++++++++++------ .../export/LaReferenciaDownloadLogs.java | 14 ++-- .../usagestats/export/UsageStatsExporter.java | 32 +++++--- .../export/usagestats_parameters.json | 25 ++++++- .../graph/usagestats/oozie_app/workflow.xml | 4 + 6 files changed, 126 insertions(+), 42 deletions(-) diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java index ddddc9f66..6b18140a7 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/ExecuteWorkflow.java @@ -48,6 +48,11 @@ public class ExecuteWorkflow { static boolean downloadLaReferenciaLogs; static boolean processLaReferenciaLogs; + static boolean irusCreateTablesEmptyDirs; + static boolean irusDownloadReports; + static boolean irusProcessStats; + static int irusNumberOfOpendoarsToDownload; + public static void main(String args[]) throws Exception { // Sending the logs to the console @@ -116,6 +121,20 @@ public class ExecuteWorkflow { else processLaReferenciaLogs = false; + if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) + irusCreateTablesEmptyDirs = true; + else + irusCreateTablesEmptyDirs = false; + if (parser.get("irusDownloadReports").toLowerCase().equals("true")) + irusDownloadReports = true; + else + irusDownloadReports = false; + if (parser.get("irusProcessStats").toLowerCase().equals("true")) + irusProcessStats = true; + else + irusProcessStats = false; + irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); usagestatsExport.export(); } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java index cbcf45102..7725f5b21 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/IrusStats.java @@ -8,8 +8,10 @@ import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.Statement; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Calendar; import java.util.Date; +import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; @@ -32,16 +34,22 @@ public class IrusStats { public IrusStats(String irusUKURL) throws Exception { this.irusUKURL = irusUKURL; - logger.info("Creating Irus Stats tables"); - createTables(); - logger.info("Created Irus Stats tables"); // The following may not be needed - It will be created when JSON tables are created // createTmpTables(); } - private void createTables() throws Exception { - try { + public void reCreateLogDirs() throws Exception { + FileSystem dfs = FileSystem.get(new Configuration()); + logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); + + logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); + } + + public void createTables() throws Exception { + try { logger.info("Creating sushilog"); Statement stmt = ConnectDB.getHiveConnection().createStatement(); String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() @@ -198,17 +206,26 @@ public class IrusStats { } public void getIrusRRReport(String irusUKReportPath) throws Exception { - SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=2016-01&EndDate=" - + simpleDateFormat.format(new Date()) - + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); + end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); + + String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; logger.info("(processIrusRRReport) Getting report: " + reportUrl); String text = getJson(reportUrl, "", ""); -// log.info("Report: " + text); - + List opendoarsToVisit = new ArrayList(); JSONParser parser = new JSONParser(); JSONObject jsonObject = (JSONObject) parser.parse(text); jsonObject = (JSONObject) jsonObject.get("ReportResponse"); @@ -224,6 +241,7 @@ public class IrusStats { JSONObject opendoar = (JSONObject) identifier; if (opendoar.get("Type").toString().equals("OpenDOAR")) { i++; + opendoarsToVisit.add(opendoar.get("Value").toString()); getIrusIRReport(opendoar.get("Value").toString(), irusUKReportPath); break; } @@ -231,7 +249,22 @@ public class IrusStats { // break; } - logger.info("(processIrusRRReport) Finished with report: " + reportUrl); + logger.info("Found the following opendoars for download: " + opendoarsToVisit); + + if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 && + ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + } + + logger.info("Downloading the followins opendoars: " + opendoarsToVisit); + + for (String opendoar : opendoarsToVisit) { + logger.info("Now working on piwikId: " + opendoar); + this.getIrusIRReport(opendoar, irusUKReportPath); + } + + logger.info("Finished with report: " + reportUrl); } private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { @@ -242,13 +275,15 @@ public class IrusStats { SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); - Calendar start = Calendar.getInstance(); - start.set(Calendar.YEAR, 2016); - start.set(Calendar.MONTH, Calendar.JANUARY); - // start.setTime(simpleDateFormat.parse("2016-01")); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + simpleDateFormat.format(start.getTime())); - Calendar end = Calendar.getInstance(); + // Setting the ending period (last day of the month) + Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + end.add(Calendar.MONTH, +1); end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + simpleDateFormat.format(end.getTime())); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); PreparedStatement st = ConnectDB @@ -264,10 +299,6 @@ public class IrusStats { } } rs_date.close(); - PreparedStatement preparedStatement = ConnectDB - .getHiveConnection() - .prepareStatement( - "INSERT INTO sushilogtmp (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)"); int batch_size = 0; while (start.before(end)) { @@ -310,7 +341,6 @@ public class IrusStats { fin.close(); } - preparedStatement.executeBatch(); ConnectDB.getHiveConnection().close(); logger.info("(processIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java index 1b4742543..01562e22a 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/LaReferenciaDownloadLogs.java @@ -137,7 +137,7 @@ public class LaReferenciaDownloadLogs { String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; String content = ""; - List siteIdToVisit = new ArrayList(); + List siteIdsToVisit = new ArrayList(); // Getting all the siteIds in a list for logging reasons & limiting the list // to the max number of siteIds @@ -146,19 +146,19 @@ public class LaReferenciaDownloadLogs { JSONArray jsonArray = (JSONArray) parser.parse(content); for (Object aJsonArray : jsonArray) { JSONObject jsonObjectRow = (JSONObject) aJsonArray; - siteIdToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); + siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); } - logger.info("Found the following siteIds for download: " + siteIdToVisit); + logger.info("Found the following siteIds for download: " + siteIdsToVisit); if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 && - ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdToVisit.size()) { + ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); - siteIdToVisit = siteIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); } - logger.info("Downloading from repos with the followins siteIds: " + siteIdToVisit); + logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); - for (int siteId : siteIdToVisit) { + for (int siteId : siteIdsToVisit) { logger.info("Now working on piwikId: " + siteId); this.GetLaReFerenciaLogs(repoLogsPath, siteId); } diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java index cd93459bb..4318f2cde 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java @@ -60,9 +60,6 @@ public class UsageStatsExporter { logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); - logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); - logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); @@ -78,9 +75,6 @@ public class UsageStatsExporter { logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); - logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); - dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); - logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); @@ -146,14 +140,28 @@ public class UsageStatsExporter { lastats.processLogs(); logger.info("LaReferencia logs done"); } + + IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); + if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { + logger.info("Creating Irus Stats tables"); + irusstats.createTables(); + logger.info("Created Irus Stats tables"); + + logger.info("Re-create log dirs"); + irusstats.reCreateLogDirs(); + logger.info("Re-created log dirs"); + } + + if (ExecuteWorkflow.irusDownloadReports) { + irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); + } + if (ExecuteWorkflow.irusProcessStats) { + irusstats.processIrusStats(); + logger.info("Irus done"); + } + System.exit(0); -// IrusStats irusstats = new IrusStats(irusUKBaseURL); -// irusstats.getIrusRRReport(irusUKReportPath); - -// irusstats.processIrusStats(); -// log.info("irus done"); - // SarcStats sarcStats = new SarcStats(); // sarcStats.getAndProcessSarc(sarcsReportPathArray, sarcsReportPathNonArray); // sarcStats.finalizeSarcStats(); diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json index 55998aa9c..48cabfde7 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/export/usagestats_parameters.json @@ -148,6 +148,29 @@ "paramLongName": "downloadLaReferenciaLogs", "paramDescription": "download La Referencia logs?", "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "irusCreateTablesEmptyDirs", + "paramDescription": "Irus section: Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "irusDownloadReports", + "paramDescription": "Irus section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "irusProcessStats", + "paramDescription": "Irus section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "irusNumberOfOpendoarsToDownload", + "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", + "paramRequired": true } ] - diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml index 758bba1ab..b14e9408f 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-usage-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestats/oozie_app/workflow.xml @@ -67,6 +67,10 @@ --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} --downloadLaReferenciaLogs${downloadLaReferenciaLogs} --processLaReferenciaLogs${processLaReferenciaLogs} + --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} + --irusDownloadReports${irusDownloadReports} + --irusProcessStats${irusProcessStats} + --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload}