Fix the date limits so that OpenAIRE logs are downloaded for the correct period

This commit is contained in:
Spyros Zoupanos 2020-10-04 10:19:44 +03:00
parent 7b7075cfdd
commit 2b330dd84c
1 changed file with 23 additions and 17 deletions

View File

@ -11,8 +11,10 @@ import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
@ -60,7 +62,7 @@ public class PiwikDownloadLogs {
private String getJson(String url) throws Exception {
try {
System.out.println("===> Connecting to: " + url);
logger.info("Connecting to download the JSON: " + url);
URL website = new URL(url);
URLConnection connection = website.openConnection();
@ -87,29 +89,32 @@ public class PiwikDownloadLogs {
public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
Statement statement = ConnectDB.getHiveConnection().createStatement();
// SimpleDateFormat sdf = new SimpleDateFormat("MM/dd/yyyy");
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
ResultSet rs = statement
.executeQuery(
"SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
+ ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id");
while (rs.next()) {
int siteId = rs.getInt(1);
// SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
// Calendar start = Calendar.getInstance();
// start.set(Calendar.YEAR, 2016);
// start.set(Calendar.MONTH, Calendar.MARCH);
// start.setTime(simpleDateFormat.parse("2016-01"));
Calendar start = ExecuteWorkflow.startingLogPeriod;
logger.info("GetOpenAIRELogs starting period: " + sdf.format(start.getTime()));
// Getting all the piwikids in a list for logging reasons
List<Integer> piwikIdToVisit = new ArrayList<Integer>();
while (rs.next())
piwikIdToVisit.add(rs.getInt(1));
logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
// Calendar end = Calendar.getInstance();
// end.add(Calendar.DAY_OF_MONTH, -1);
Calendar end = ExecuteWorkflow.endingLogPeriod;
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("GetOpenAIRELogs ending period: " + sdf.format(end.getTime()));
// Setting the starting period
Calendar start = ExecuteWorkflow.startingLogPeriod;
logger.info("Starting period for log download: " + sdf.format(start.getTime()));
// Setting the ending period (last day of the month)
Calendar end = ExecuteWorkflow.endingLogPeriod;
end.add(Calendar.MONTH, +1);
end.add(Calendar.DAY_OF_MONTH, -1);
logger.info("Starting period for log download: " + sdf.format(end.getTime()));
for (int siteId : piwikIdToVisit) {
logger.info("Now working on piwikId: " + siteId);
PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
.prepareStatement(
@ -126,7 +131,8 @@ public class PiwikDownloadLogs {
}
rs_date.close();
for (Date date = start.getTime(); start.before(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
Date date = currDay.getTime();
logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
String period = "&period=day&date=" + sdf.format(date);