Better file re-organisation for piwiklogs

Spyros Zoupanos 2020-10-20 18:05:03 +03:00
parent 7fdf994eb6
commit 1ca74ce830
3 changed files with 26 additions and 36 deletions
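In PiwikDownloadLogs, each paginated Matomo API response is now written to its own <siteId>_Piwiklog<date>_offset_<i>.json file, created and closed inside the download loop, with a per-file byte count reported; previously a single file per site and day was opened before the loop. Error logging in getJson() now includes the failing URL, the worker threads log currDay formatted as yyyy-MM-dd, and the download thread pool shrinks from 20 to 10 threads. Dead commented-out code is removed from PiwikStatsDB and SarcStats.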

PiwikDownloadLogs.java

@@ -64,27 +64,22 @@ public class PiwikDownloadLogs {
 	private String getJson(String url) throws Exception {
 		try {
-			logger.info("Connecting to download the JSON: " + url);
+			logger.debug("Connecting to download the JSON: " + url);
 			URL website = new URL(url);
 			URLConnection connection = website.openConnection();
-			// connection.setRequestProperty ("Authorization", "Basic "+encoded);
 			StringBuilder response;
 			try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
 				response = new StringBuilder();
 				String inputLine;
 				while ((inputLine = in.readLine()) != null) {
 					response.append(inputLine);
-					// response.append("\n");
 				}
 			}
-			// System.out.println("response ====> " + response.toString());
 			return response.toString();
 		} catch (Exception e) {
-			logger.error("Failed to get URL: " + e);
-			throw new Exception("Failed to get URL: " + e.toString(), e);
+			logger.error("Failed to get URL: " + url + " Exception: " + e);
+			throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
 		}
 	}
@@ -105,10 +100,11 @@ public class PiwikDownloadLogs {
 		}

 		public void run() {
+			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
 			System.out
 				.println(
 					Thread.currentThread().getName() + " (Start) Thread for "
-						+ "parameters: currDay=" + currDay + ", siteId=" + siteId +
+						+ "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId +
 						", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath +
 						", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
 			try {
@@ -121,7 +117,7 @@ public class PiwikDownloadLogs {
 			System.out
 				.println(
 					Thread.currentThread().getName() + " (End) Thread for "
-						+ "parameters: currDay=" + currDay + ", siteId=" + siteId +
+						+ "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId +
 						", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath +
 						", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
 		}
@@ -151,9 +147,9 @@ public class PiwikDownloadLogs {
 		JSONParser parser = new JSONParser();
 		StringBuffer totalContent = new StringBuffer();
 		FileSystem fs = FileSystem.get(new Configuration());
-		FSDataOutputStream fin = fs
-			.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);

 		do {
+			int writtenBytes = 0;
 			String apiUrl = baseApiUrl;

 			if (i > 0) {
@@ -164,23 +160,31 @@ public class PiwikDownloadLogs {
 			if (content.length() == 0 || content.equals("[]"))
 				break;

+			FSDataOutputStream fin = fs
+				.create(
+					new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+						+ ".json"),
+					true);
 			JSONArray jsonArray = (JSONArray) parser.parse(content);
 			for (Object aJsonArray : jsonArray) {
 				JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
-				fin.write(jsonObjectRaw.toJSONString().getBytes());
+				byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+				fin.write(jsonObjectRawBytes);
 				fin.writeChar('\n');
-				// totalContent.append(jsonObjectRaw.toJSONString());
-				// totalContent.append('\n');
+				writtenBytes += jsonObjectRawBytes.length + 1;
 			}
+			fin.close();
+			System.out
+				.println(
+					Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+						+ " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+						+ ".json");
 			i++;
 		} while (true);
-		// FileSystem fs = FileSystem.get(new Configuration());
-		// FSDataOutputStream fin = fs
-		// 	.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);
-		//
-		// fin.write(totalContent.toString().getBytes());
-		fin.close();
 		fs.close();
 	}
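The core change in the hunk above: each page of results gets its own _offset_<i> file, created and closed inside the do/while loop, instead of a single stream opened before it. Below is a minimal self-contained sketch of that pattern using the same Hadoop FileSystem API; fetchPage, outFolder, siteId, and dateStr are hypothetical stand-ins for the surrounding class's fields and its paginated Matomo API call:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OffsetFileWriteSketch {

	// Hypothetical stand-in for one page of the paginated Matomo API response.
	static String fetchPage(int offset) {
		return offset < 2 ? "[{\"page\": " + offset + "}]" : "[]";
	}

	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration()); // local FS under a default Configuration
		String outFolder = "/tmp/piwiklogs"; // hypothetical output folder
		int siteId = 13; // hypothetical site id
		String dateStr = "2020-10-20"; // sdf.format(date) in the real code

		int i = 0;
		while (true) {
			String content = fetchPage(i);
			if (content.length() == 0 || content.equals("[]"))
				break;

			// One file per offset, created and closed inside the loop,
			// instead of a single stream opened before it.
			Path out = new Path(outFolder + "/" + siteId + "_Piwiklog" + dateStr + "_offset_" + i + ".json");
			int writtenBytes = 0;
			try (FSDataOutputStream fin = fs.create(out, true)) {
				byte[] record = content.getBytes(StandardCharsets.UTF_8);
				fin.write(record);
				fin.write('\n'); // one byte; the committed code uses writeChar, which writes two
				writtenBytes += record.length + 1;
			}
			System.out.println("Wrote " + writtenBytes + " bytes to " + out);
			i++;
		}
		fs.close();
	}
}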
@@ -221,7 +225,7 @@ public class PiwikDownloadLogs {
 		logger.info("Ending period for log download: " + sdf.format(end.getTime()));

 		// FileSystem fs = FileSystem.get(new Configuration());
-		ExecutorService executor = Executors.newFixedThreadPool(20); // creating a pool of 20 threads
+		ExecutorService executor = Executors.newFixedThreadPool(10); // creating a pool of 10 threads
 		for (int siteId : piwikIdToVisit) {
 			logger.info("Now working on piwikId: " + siteId);
@@ -242,7 +246,6 @@ public class PiwikDownloadLogs {
 			rs_date.close();

 			for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
-				// Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID, fs);
 				Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
 				executor.execute(worker); // calling execute method of ExecutorService
 			}
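The last two hunks keep the fan-out structure: one WorkerThread per (site, day) submitted to a fixed pool, now sized 10. A minimal sketch of that dispatch pattern, with a hypothetical Runnable standing in for WorkerThread:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class PoolDispatchSketch {
	public static void main(String[] args) throws InterruptedException {
		// Fixed pool of 10 threads, matching the size chosen in this commit.
		ExecutorService executor = Executors.newFixedThreadPool(10);
		for (int day = 0; day < 30; day++) {
			final int currDay = day; // hypothetical stand-in for the Calendar loop variable
			executor.execute(() -> System.out.println(
				Thread.currentThread().getName() + " downloading day " + currDay));
		}
		executor.shutdown(); // stop accepting work, let queued tasks finish
		executor.awaitTermination(1, TimeUnit.HOURS);
	}
}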

PiwikStatsDB.java

@@ -225,17 +225,6 @@ public class PiwikStatsDB {
 		}
 	}

-	// public void usageStats() throws Exception {
-	// 	try {
-	// 		viewsStats();
-	// 		downloadsStats();
-	// 		log.info("stat tables and views done");
-	// 	} catch (Exception e) {
-	// 		log.error("Failed to create usage usagestats: " + e);
-	// 		throw new Exception("Failed to create usage usagestats: " + e.toString(), e);
-	// 	}
-	// }

 	public void processRepositoryLog() throws Exception {
 		Statement stmt = ConnectDB.getHiveConnection().createStatement();

SarcStats.java

@@ -94,8 +94,6 @@ public class SarcStats {
 		stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
 		logger.info("Added JSON Serde jar");

-		// " + issn.replace("-", "_"
 		logger.info("Dropping sarc_sushilogtmp_json_array table");
 		String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " +
 			ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";