dnet-hadoop/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/PiwikDownloadLogs.java

190 lines
6.0 KiB
Java

package eu.dnetlib.oa.graph.usagestats.export;
import java.io.*;
import java.net.Authenticator;
import java.net.PasswordAuthentication;
import java.net.URL;
import java.net.URLConnection;
import java.security.cert.X509Certificate;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author D. Pierrakos, S. Zoupanos
*/
public class PiwikDownloadLogs {
private final String piwikUrl;
private Date startDate;
private final String tokenAuth;
/*
* The Piwik's API method
*/
private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
private final String format = "&format=json";
private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
this.piwikUrl = piwikUrl;
this.tokenAuth = tokenAuth;
}
private String getPiwikLogUrl() {
return "https://" + piwikUrl + "/";
}
private String getJson(String url) throws Exception {
try {
logger.info("Connecting to download the JSON: " + url);
URL website = new URL(url);
URLConnection connection = website.openConnection();
// connection.setRequestProperty ("Authorization", "Basic "+encoded);
StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder();
String inputLine;
while ((inputLine = in.readLine()) != null) {
response.append(inputLine);
// response.append("\n");
}
}
// System.out.println("response ====> " + response.toString());
return response.toString();
} catch (Exception e) {
logger.error("Failed to get URL: " + e);
throw new Exception("Failed to get URL: " + e.toString(), e);
}
}
public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
Statement statement = ConnectDB.getHiveConnection().createStatement();
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
ResultSet rs = statement
.executeQuery(
"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
// Getting all the piwikids in a list for logging reasons & limitting the list
// to the max number of piwikids
List<Integer> piwikIdToVisit = new ArrayList<Integer>();
while (rs.next())
piwikIdToVisit.add(rs.getInt(1));
logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
}
logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
// Setting the starting period
Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
logger.info("Starting period for log download: " + sdf.format(start.getTime()));
// Setting the ending period (last day of the month)
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
end.add(Calendar.MONTH, +1);
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("Ending period for log download: " + sdf.format(end.getTime()));
for (int siteId : piwikIdToVisit) {
logger.info("Now working on piwikId: " + siteId);
PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
.prepareStatement(
"SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ ".piwiklog WHERE source=? GROUP BY timestamp HAVING max(timestamp) is not null");
st.setInt(1, siteId);
ResultSet rs_date = st.executeQuery();
while (rs_date.next()) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
}
}
rs_date.close();
for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
Date date = currDay.getTime();
logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
String period = "&period=day&date=" + sdf.format(date);
String outFolder = "";
// portal siteId = 109;
if (siteId == Integer.parseInt(portalMatomoID)) {
outFolder = portalLogPath;
} else {
outFolder = repoLogsPath;
}
FileSystem fs = FileSystem.get(new Configuration());
FSDataOutputStream fin = fs
.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
String content = "";
int i = 0;
JSONParser parser = new JSONParser();
while (!content.equals("[]")) {
String apiUrl = baseApiUrl;
if (i > 0) {
apiUrl += "&filter_offset=" + (i * 1000);
}
content = getJson(apiUrl);
JSONArray jsonArray = (JSONArray) parser.parse(content);
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
fin.write(jsonObjectRaw.toJSONString().getBytes());
fin.writeChar('\n');
}
i++;
}
fin.close();
}
}
}
}