2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-12 19:38:31 +02:00
|
|
|
package eu.dnetlib.oa.graph.usagestats.export;
|
2020-05-07 18:00:03 +02:00
|
|
|
|
|
|
|
import java.io.*;
|
2020-05-16 12:16:16 +02:00
|
|
|
import java.net.Authenticator;
|
|
|
|
import java.net.PasswordAuthentication;
|
2020-05-07 18:00:03 +02:00
|
|
|
import java.net.URL;
|
|
|
|
import java.net.URLConnection;
|
2020-05-16 12:16:16 +02:00
|
|
|
import java.security.cert.X509Certificate;
|
2020-05-07 18:00:03 +02:00
|
|
|
import java.sql.PreparedStatement;
|
|
|
|
import java.sql.ResultSet;
|
|
|
|
import java.sql.Statement;
|
|
|
|
import java.text.SimpleDateFormat;
|
2020-10-04 09:19:44 +02:00
|
|
|
import java.util.ArrayList;
|
2020-05-07 18:00:03 +02:00
|
|
|
import java.util.Calendar;
|
2020-05-07 20:46:14 +02:00
|
|
|
import java.util.Date;
|
2020-10-04 09:19:44 +02:00
|
|
|
import java.util.List;
|
2020-05-07 20:46:14 +02:00
|
|
|
|
2020-05-16 12:16:16 +02:00
|
|
|
import javax.net.ssl.HostnameVerifier;
|
|
|
|
import javax.net.ssl.HttpsURLConnection;
|
|
|
|
import javax.net.ssl.SSLContext;
|
|
|
|
import javax.net.ssl.SSLSession;
|
|
|
|
import javax.net.ssl.TrustManager;
|
|
|
|
import javax.net.ssl.X509TrustManager;
|
|
|
|
|
2020-05-07 20:46:14 +02:00
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
|
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
|
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
|
|
import org.apache.hadoop.fs.Path;
|
2020-05-21 20:49:33 +02:00
|
|
|
import org.json.simple.JSONArray;
|
|
|
|
import org.json.simple.JSONObject;
|
|
|
|
import org.json.simple.parser.JSONParser;
|
2020-10-02 15:25:21 +02:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
import org.slf4j.LoggerFactory;
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-10-02 15:25:21 +02:00
|
|
|
/**
|
|
|
|
* @author D. Pierrakos, S. Zoupanos
|
|
|
|
*/
|
2020-05-07 18:00:03 +02:00
|
|
|
public class PiwikDownloadLogs {
|
|
|
|
|
|
|
|
private final String piwikUrl;
|
|
|
|
private Date startDate;
|
|
|
|
private final String tokenAuth;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The Piwik's API method
|
|
|
|
*/
|
|
|
|
private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
|
|
|
|
private final String format = "&format=json";
|
|
|
|
|
2020-10-02 15:25:21 +02:00
|
|
|
private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
|
2020-05-07 18:00:03 +02:00
|
|
|
|
|
|
|
public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
|
|
|
|
this.piwikUrl = piwikUrl;
|
|
|
|
this.tokenAuth = tokenAuth;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
private String getPiwikLogUrl() {
|
|
|
|
return "https://" + piwikUrl + "/";
|
|
|
|
}
|
|
|
|
|
|
|
|
private String getJson(String url) throws Exception {
|
|
|
|
try {
|
2020-10-04 09:19:44 +02:00
|
|
|
logger.info("Connecting to download the JSON: " + url);
|
2020-05-07 18:00:03 +02:00
|
|
|
URL website = new URL(url);
|
|
|
|
URLConnection connection = website.openConnection();
|
|
|
|
|
|
|
|
// connection.setRequestProperty ("Authorization", "Basic "+encoded);
|
|
|
|
StringBuilder response;
|
|
|
|
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
|
|
|
|
response = new StringBuilder();
|
|
|
|
String inputLine;
|
|
|
|
while ((inputLine = in.readLine()) != null) {
|
|
|
|
response.append(inputLine);
|
2020-05-21 20:49:33 +02:00
|
|
|
// response.append("\n");
|
2020-05-07 18:00:03 +02:00
|
|
|
}
|
|
|
|
}
|
2020-05-21 20:49:33 +02:00
|
|
|
|
2020-09-04 18:49:07 +02:00
|
|
|
// System.out.println("response ====> " + response.toString());
|
2020-05-21 20:49:33 +02:00
|
|
|
|
2020-05-07 18:00:03 +02:00
|
|
|
return response.toString();
|
|
|
|
} catch (Exception e) {
|
2020-10-02 15:25:21 +02:00
|
|
|
logger.error("Failed to get URL: " + e);
|
2020-05-07 18:00:03 +02:00
|
|
|
throw new Exception("Failed to get URL: " + e.toString(), e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
|
|
|
|
|
2020-09-27 12:19:45 +02:00
|
|
|
Statement statement = ConnectDB.getHiveConnection().createStatement();
|
2020-10-03 23:24:55 +02:00
|
|
|
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
2020-05-07 20:46:14 +02:00
|
|
|
|
2020-09-01 21:06:16 +02:00
|
|
|
ResultSet rs = statement
|
2020-09-02 20:02:56 +02:00
|
|
|
.executeQuery(
|
|
|
|
"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
|
|
|
|
+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
|
2020-10-04 09:19:44 +02:00
|
|
|
|
2020-10-04 16:03:01 +02:00
|
|
|
// Getting all the piwikids in a list for logging reasons & limitting the list
|
|
|
|
// to the max number of piwikids
|
2020-10-04 09:19:44 +02:00
|
|
|
List<Integer> piwikIdToVisit = new ArrayList<Integer>();
|
|
|
|
while (rs.next())
|
|
|
|
piwikIdToVisit.add(rs.getInt(1));
|
2020-10-04 16:03:01 +02:00
|
|
|
logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
|
|
|
|
|
|
|
|
if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 &&
|
|
|
|
ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
|
|
|
|
logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
|
|
|
|
piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
|
|
|
|
}
|
|
|
|
|
2020-10-04 09:19:44 +02:00
|
|
|
logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
|
|
|
|
|
|
|
|
// Setting the starting period
|
2020-10-05 18:09:31 +02:00
|
|
|
Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
|
2020-10-04 09:19:44 +02:00
|
|
|
logger.info("Starting period for log download: " + sdf.format(start.getTime()));
|
|
|
|
|
|
|
|
// Setting the ending period (last day of the month)
|
2020-10-05 18:09:31 +02:00
|
|
|
Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
|
2020-10-04 09:19:44 +02:00
|
|
|
end.add(Calendar.MONTH, +1);
|
|
|
|
end.add(Calendar.DAY_OF_MONTH, -1);
|
2020-10-05 18:09:31 +02:00
|
|
|
logger.info("Ending period for log download: " + sdf.format(end.getTime()));
|
2020-10-04 09:19:44 +02:00
|
|
|
|
|
|
|
for (int siteId : piwikIdToVisit) {
|
|
|
|
|
|
|
|
logger.info("Now working on piwikId: " + siteId);
|
2020-05-07 20:46:14 +02:00
|
|
|
|
2020-09-27 12:19:45 +02:00
|
|
|
PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
|
2020-05-07 20:46:14 +02:00
|
|
|
.prepareStatement(
|
2020-05-14 21:27:18 +02:00
|
|
|
"SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
|
2020-05-16 12:16:16 +02:00
|
|
|
+ ".piwiklog WHERE source=? GROUP BY timestamp HAVING max(timestamp) is not null");
|
2020-05-07 20:46:14 +02:00
|
|
|
st.setInt(1, siteId);
|
|
|
|
|
|
|
|
ResultSet rs_date = st.executeQuery();
|
|
|
|
while (rs_date.next()) {
|
|
|
|
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
|
|
|
|
&& !rs_date.getString(1).equals("")) {
|
|
|
|
start.setTime(sdf.parse(rs_date.getString(1)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rs_date.close();
|
|
|
|
|
2020-10-04 09:19:44 +02:00
|
|
|
for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
|
|
|
|
Date date = currDay.getTime();
|
2020-10-02 15:25:21 +02:00
|
|
|
logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
|
2020-05-07 20:46:14 +02:00
|
|
|
|
|
|
|
String period = "&period=day&date=" + sdf.format(date);
|
|
|
|
String outFolder = "";
|
|
|
|
// portal siteId = 109;
|
|
|
|
if (siteId == Integer.parseInt(portalMatomoID)) {
|
|
|
|
outFolder = portalLogPath;
|
|
|
|
} else {
|
|
|
|
outFolder = repoLogsPath;
|
|
|
|
}
|
|
|
|
FileSystem fs = FileSystem.get(new Configuration());
|
|
|
|
FSDataOutputStream fin = fs
|
|
|
|
.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-07 20:46:14 +02:00
|
|
|
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
|
|
|
|
+ "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
|
|
|
|
String content = "";
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-07 20:46:14 +02:00
|
|
|
int i = 0;
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-21 20:49:33 +02:00
|
|
|
JSONParser parser = new JSONParser();
|
|
|
|
while (!content.equals("[]")) {
|
2020-05-07 20:46:14 +02:00
|
|
|
String apiUrl = baseApiUrl;
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-07 20:46:14 +02:00
|
|
|
if (i > 0) {
|
|
|
|
apiUrl += "&filter_offset=" + (i * 1000);
|
|
|
|
}
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-19 17:45:28 +02:00
|
|
|
content = getJson(apiUrl);
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-21 20:49:33 +02:00
|
|
|
JSONArray jsonArray = (JSONArray) parser.parse(content);
|
|
|
|
for (Object aJsonArray : jsonArray) {
|
|
|
|
JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
|
|
|
|
fin.write(jsonObjectRaw.toJSONString().getBytes());
|
|
|
|
fin.writeChar('\n');
|
|
|
|
}
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-07 20:46:14 +02:00
|
|
|
i++;
|
|
|
|
}
|
|
|
|
fin.close();
|
|
|
|
}
|
2020-05-07 18:00:03 +02:00
|
|
|
|
2020-05-07 20:46:14 +02:00
|
|
|
}
|
|
|
|
}
|
2020-05-07 18:00:03 +02:00
|
|
|
}
|