207 lines
7.1 KiB
Java
207 lines
7.1 KiB
Java
|
|
package eu.dnetlib.oa.graph.usagerawdata.export;
|
|
|
|
import java.io.IOException;
|
|
import java.sql.SQLException;
|
|
import java.sql.Statement;
|
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
import org.apache.hadoop.fs.Path;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
/**
|
|
* Main class for downloading and processing Usage statistics
|
|
*
|
|
* @author D. Pierrakos, S. Zoupanos
|
|
*/
|
|
public class UsageStatsExporter {
|
|
|
|
public UsageStatsExporter() {
|
|
|
|
}
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
|
|
|
|
private void reCreateLogDirs() throws IllegalArgumentException, IOException {
|
|
FileSystem dfs = FileSystem.get(new Configuration());
|
|
|
|
logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
|
|
dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
|
|
|
|
logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
|
|
dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
|
|
|
|
logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
|
|
dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
|
|
|
|
logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
|
|
dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
|
|
|
|
logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
|
|
dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
|
|
|
|
logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
|
|
dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
|
|
}
|
|
|
|
public void export() throws Exception {
|
|
|
|
logger.info("Initialising DB properties");
|
|
ConnectDB.init();
|
|
|
|
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
|
|
|
|
logger.info("Re-creating database and tables");
|
|
if (ExecuteWorkflow.recreateDbAndTables) {
|
|
piwikstatsdb.recreateDBAndTables();
|
|
logger.info("DB-Tables-TmpTables are created ");
|
|
}
|
|
|
|
logger.info("Initializing the download logs module");
|
|
PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
|
|
|
|
if (ExecuteWorkflow.piwikEmptyDirs) {
|
|
logger.info("Recreating Piwik log directories");
|
|
piwikstatsdb.reCreateLogDirs();
|
|
}
|
|
|
|
// Downloading piwik logs (also managing directory creation)
|
|
if (ExecuteWorkflow.downloadPiwikLogs) {
|
|
logger.info("Downloading piwik logs");
|
|
piwd
|
|
.GetOpenAIRELogs(
|
|
ExecuteWorkflow.repoLogPath,
|
|
ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
|
|
}
|
|
logger.info("Downloaded piwik logs");
|
|
|
|
// Create DB tables, insert/update statistics
|
|
String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
|
|
piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
|
|
|
|
if (ExecuteWorkflow.processPiwikLogs) {
|
|
logger.info("Processing logs");
|
|
piwikstatsdb.processLogs();
|
|
}
|
|
|
|
logger.info("Creating LaReferencia tables");
|
|
LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
|
|
ExecuteWorkflow.lareferenciaAuthToken);
|
|
|
|
if (ExecuteWorkflow.laReferenciaEmptyDirs) {
|
|
logger.info("Recreating LaReferencia log directories");
|
|
lrf.reCreateLogDirs();
|
|
}
|
|
|
|
if (ExecuteWorkflow.downloadLaReferenciaLogs) {
|
|
logger.info("Downloading LaReferencia logs");
|
|
lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
|
|
logger.info("Downloaded LaReferencia logs");
|
|
}
|
|
|
|
LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
|
|
|
|
if (ExecuteWorkflow.processLaReferenciaLogs) {
|
|
logger.info("Processing LaReferencia logs");
|
|
lastats.processLogs();
|
|
logger.info("LaReferencia logs done");
|
|
}
|
|
|
|
IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
|
|
if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
|
|
logger.info("Creating Irus Stats tables");
|
|
irusstats.createTables();
|
|
logger.info("Created Irus Stats tables");
|
|
|
|
logger.info("Re-create log dirs");
|
|
irusstats.reCreateLogDirs();
|
|
logger.info("Re-created log dirs");
|
|
}
|
|
|
|
if (ExecuteWorkflow.irusDownloadReports) {
|
|
irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
|
|
}
|
|
|
|
if (ExecuteWorkflow.irusProcessStats) {
|
|
irusstats.processIrusStats();
|
|
logger.info("Irus done");
|
|
}
|
|
|
|
SarcStats sarcStats = new SarcStats();
|
|
if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
|
|
sarcStats.reCreateLogDirs();
|
|
}
|
|
if (ExecuteWorkflow.sarcDownloadReports) {
|
|
sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
|
|
}
|
|
|
|
if (ExecuteWorkflow.sarcProcessStats) {
|
|
sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
|
|
sarcStats.updateSarcLogs();
|
|
}
|
|
logger.info("Sarc done");
|
|
// finalize usagestats
|
|
|
|
logger.info("Dropping tmp tables");
|
|
if (ExecuteWorkflow.finalizeStats) {
|
|
piwikstatsdb.finalizeStats();
|
|
logger.info("Dropped tmp tables");
|
|
}
|
|
|
|
logger.info("Raw Data Download End");
|
|
}
|
|
|
|
public void createdDBWithTablesOnly() throws Exception {
|
|
logger.info("Initialising DB properties");
|
|
ConnectDB.init();
|
|
|
|
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
|
|
piwikstatsdb.recreateDBAndTables();
|
|
|
|
piwikstatsdb.createPedocsOldUsageData();
|
|
Statement stmt = ConnectDB.getHiveConnection().createStatement();
|
|
|
|
logger.info("Creating LaReferencia tables");
|
|
String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
|
|
+ ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
|
|
+ "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
|
|
+ "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
|
|
+ "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
|
|
+ "stored as orc tblproperties('transactional'='true')";
|
|
stmt.executeUpdate(sqlCreateTableLareferenciaLog);
|
|
logger.info("Created LaReferencia tables");
|
|
|
|
logger.info("Creating sushilog");
|
|
|
|
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
|
|
+ ".sushilog(source STRING, "
|
|
+ "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
|
|
+ "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
|
|
stmt.executeUpdate(sqlCreateTableSushiLog);
|
|
logger.info("Created sushilog");
|
|
|
|
logger.info("Updating piwiklog");
|
|
String sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
|
|
+ ".piwiklog select * from openaire_prod_usage_raw.piwiklog";
|
|
stmt.executeUpdate(sql);
|
|
|
|
logger.info("Updating lareferencialog");
|
|
sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
|
|
+ ".lareferencialog select * from openaire_prod_usage_raw.lareferencialog";
|
|
stmt.executeUpdate(sql);
|
|
|
|
logger.info("Updating sushilog");
|
|
sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
|
|
+ ".sushilog select * from openaire_prod_usage_raw.sushilog";
|
|
stmt.executeUpdate(sql);
|
|
|
|
stmt.close();
|
|
ConnectDB.getHiveConnection().close();
|
|
logger.info("Sushi Tables Created");
|
|
|
|
}
|
|
|
|
}
|