package eu.dnetlib.oa.graph.usagestats.export;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Downloads Matomo (Piwik) visit logs for all datasources registered in the
 * stats DB and stores them as JSON files on HDFS.
 *
 * @author D. Pierrakos, S. Zoupanos
 */
public class PiwikDownloadLogs {

	private final String piwikUrl;
	private final String tokenAuth;

	/*
	 * The Piwik API method used to fetch the visit details, and the response format.
	 */
	private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
	private final String format = "&format=json";

	private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);

	public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
		this.piwikUrl = piwikUrl;
		this.tokenAuth = tokenAuth;
	}

	private String getPiwikLogUrl() {
		return "https://" + piwikUrl + "/";
	}

	private String getJson(String url) throws Exception {
		try {
			logger.info("Connecting to download the JSON: {}", url);
			URL website = new URL(url);
			URLConnection connection = website.openConnection();

			StringBuilder response;
			try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
				response = new StringBuilder();
				String inputLine;
				while ((inputLine = in.readLine()) != null) {
					response.append(inputLine);
				}
			}
			return response.toString();
		} catch (Exception e) {
			logger.error("Failed to get URL: {}", url, e);
			throw new Exception("Failed to get URL: " + e.toString(), e);
		}
	}

	public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {

		Statement statement = ConnectDB.getHiveConnection().createStatement();
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

		ResultSet rs = statement
			.executeQuery(
				"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
					+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");

		// Getting all the piwikids in a list for logging reasons & limiting the list
		// to the max number of piwikids
		List<Integer> piwikIdToVisit = new ArrayList<>();
		while (rs.next())
			piwikIdToVisit.add(rs.getInt(1));
		logger.info("Found the following piwikIds for download: {}", piwikIdToVisit);

		if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
			&& ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
			logger.info("Trimming piwikIds list to the size of: {}", ExecuteWorkflow.numberOfPiwikIdsToDownload);
			piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
		}

		logger.info("Downloading from repos with the following piwikIds: {}", piwikIdToVisit);

		// Setting the starting period (cloned, so the shared workflow calendars
		// are not mutated as a side effect)
		Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
		logger.info("Starting period for log download: {}", sdf.format(start.getTime()));

		// Setting the ending period (last day of the month)
		Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
		end.add(Calendar.MONTH, +1);
		end.add(Calendar.DAY_OF_MONTH, -1);
		logger.info("Ending period for log download: {}", sdf.format(end.getTime()));

		for (int siteId : piwikIdToVisit) {
			logger.info("Now working on piwikId: {}", siteId);

			// Resume from the latest timestamp already stored for this source, so
			// only newer logs are downloaded. Each site gets its own copy of the
			// starting period; otherwise the date found for one site would leak
			// into the next iteration.
			Calendar siteStart = (Calendar) start.clone();
			PreparedStatement st = ConnectDB
				.getHiveConnection()
				.prepareStatement(
					"SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
						+ ".piwiklog WHERE source=?");
			st.setInt(1, siteId);

			ResultSet rs_date = st.executeQuery();
			while (rs_date.next()) {
				if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
					&& !rs_date.getString(1).equals("")) {
					siteStart.setTime(sdf.parse(rs_date.getString(1)));
				}
			}
			rs_date.close();
			st.close();

			for (Calendar currDay = (Calendar) siteStart.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
				Date date = currDay.getTime();
				logger.info("Downloading logs for repoid {} and for {}", siteId, sdf.format(date));

				String period = "&period=day&date=" + sdf.format(date);

				// The portal's logs go to their own output folder
				String outFolder = "";
				if (siteId == Integer.parseInt(portalMatomoID)) {
					outFolder = portalLogPath;
				} else {
					outFolder = repoLogsPath;
				}

				FileSystem fs = FileSystem.get(new Configuration());
				FSDataOutputStream fin = fs
					.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format(date) + ".json"), true);

				String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
					+ "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;

				// Page through the API results, 1000 entries at a time, until an
				// empty JSON array is returned; each visit is written as one JSON
				// object per line.
				String content = "";
				int i = 0;
				JSONParser parser = new JSONParser();
				while (!content.equals("[]")) {
					String apiUrl = baseApiUrl;
					if (i > 0) {
						apiUrl += "&filter_offset=" + (i * 1000);
					}

					content = getJson(apiUrl);

					JSONArray jsonArray = (JSONArray) parser.parse(content);
					for (Object aJsonArray : jsonArray) {
						JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
						fin.write(jsonObjectRaw.toJSONString().getBytes());
						// Write a single newline byte; writeChar('\n') would emit
						// two bytes and corrupt the line-delimited JSON output.
						fin.write('\n');
					}

					i++;
				}
				fin.close();
			}
		}
		rs.close();
		statement.close();
	}
}
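/*
 * Example usage (a minimal sketch; the Matomo host, token and HDFS paths below
 * are placeholder values, not taken from this codebase):
 *
 * PiwikDownloadLogs downloader = new PiwikDownloadLogs("analytics.example.org", "<token_auth>");
 * downloader.GetOpenAIRELogs("/user/usagestats/repologs", "/user/usagestats/portallog", "109");
 */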