forked from D-Net/dnet-hadoop

Better file re-organisation for piwiklogs

parent 7fdf994eb6
commit 1ca74ce830
@@ -64,27 +64,22 @@ public class PiwikDownloadLogs {
 	private String getJson(String url) throws Exception {
 		try {
-			logger.info("Connecting to download the JSON: " + url);
+			logger.debug("Connecting to download the JSON: " + url);
 			URL website = new URL(url);
 			URLConnection connection = website.openConnection();
 
-			// connection.setRequestProperty ("Authorization", "Basic "+encoded);
 			StringBuilder response;
 			try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
 				response = new StringBuilder();
 				String inputLine;
 				while ((inputLine = in.readLine()) != null) {
 					response.append(inputLine);
-					// response.append("\n");
 				}
 			}
 
-			// System.out.println("response ====> " + response.toString());
-
 			return response.toString();
 		} catch (Exception e) {
-			logger.error("Failed to get URL: " + e);
-			throw new Exception("Failed to get URL: " + e.toString(), e);
+			logger.error("Failed to get URL: " + url + " Exception: " + e);
+			throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
 		}
 	}
 
@@ -105,10 +100,11 @@ public class PiwikDownloadLogs {
 		}
 
 		public void run() {
+			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
 			System.out
 				.println(
 					Thread.currentThread().getName() + " (Start) Thread for "
-						+ "parameters: currDay=" + currDay + ", siteId=" + siteId +
+						+ "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId +
 						", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath +
 						", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
 			try {
@@ -121,7 +117,7 @@ public class PiwikDownloadLogs {
 			System.out
 				.println(
 					Thread.currentThread().getName() + " (End) Thread for "
-						+ "parameters: currDay=" + currDay + ", siteId=" + siteId +
+						+ "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId +
 						", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath +
 						", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
 		}
@@ -151,9 +147,9 @@ public class PiwikDownloadLogs {
 			JSONParser parser = new JSONParser();
 			StringBuffer totalContent = new StringBuffer();
 			FileSystem fs = FileSystem.get(new Configuration());
-			FSDataOutputStream fin = fs
-				.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);
 			do {
+				int writtenBytes = 0;
 				String apiUrl = baseApiUrl;
 
 				if (i > 0) {
@@ -164,23 +160,31 @@ public class PiwikDownloadLogs {
 				if (content.length() == 0 || content.equals("[]"))
 					break;
 
+				FSDataOutputStream fin = fs
+					.create(
+						new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+							+ ".json"),
+						true);
 				JSONArray jsonArray = (JSONArray) parser.parse(content);
 				for (Object aJsonArray : jsonArray) {
 					JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
-					fin.write(jsonObjectRaw.toJSONString().getBytes());
+					byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+					fin.write(jsonObjectRawBytes);
 					fin.writeChar('\n');
-					// totalContent.append(jsonObjectRaw.toJSONString());
-					// totalContent.append('\n');
+					writtenBytes += jsonObjectRawBytes.length + 1;
 				}
 
+				fin.close();
+				System.out
+					.println(
+						Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+							+ " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+							+ ".json");
+
 				i++;
 			} while (true);
 
-			// FileSystem fs = FileSystem.get(new Configuration());
-			// FSDataOutputStream fin = fs
-			// .create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);
-			//
-			// fin.write(totalContent.toString().getBytes());
-			fin.close();
 			fs.close();
 		}
 	}
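The hunk above is the core of the file re-organisation: instead of one JSON file per site and day, each page of Matomo results now goes to its own HDFS file named <siteId>_Piwiklog<yyyy-MM-dd>_offset_<i>.json, with one JSON record per line and a byte counter for the progress message. The following is a minimal, self-contained sketch of that write pattern; the class and method names (OffsetFileWriter, writeOffsetFile) are illustrative only and do not exist in the codebase, and a single-byte '\n' is used here so that the "+ 1" byte accounting is exact, whereas the hunk itself calls writeChar('\n').

import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

public class OffsetFileWriter {

	/**
	 * Writes one page (offset) of Matomo results to its own HDFS file,
	 * newline-delimited JSON, and returns the number of bytes written.
	 */
	public static int writeOffsetFile(FileSystem fs, String outFolder, int siteId,
		Date date, int offset, JSONArray records) throws Exception {

		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
		Path target = new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format(date)
			+ "_offset_" + offset + ".json");

		int writtenBytes = 0;
		// overwrite=true, mirroring fs.create(..., true) in the hunk above
		try (FSDataOutputStream fin = fs.create(target, true)) {
			for (Object record : records) {
				byte[] bytes = ((JSONObject) record).toJSONString().getBytes();
				fin.write(bytes);
				fin.write('\n'); // one JSON record per line
				writtenBytes += bytes.length + 1;
			}
		}
		return writtenBytes;
	}
}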
@@ -221,7 +225,7 @@ public class PiwikDownloadLogs {
 		logger.info("Ending period for log download: " + sdf.format(end.getTime()));
 
 		// FileSystem fs = FileSystem.get(new Configuration());
-		ExecutorService executor = Executors.newFixedThreadPool(20);// creating a pool of 5 threadsσ
+		ExecutorService executor = Executors.newFixedThreadPool(10);// creating a pool of 5 threadsσ
 		for (int siteId : piwikIdToVisit) {
 
 			logger.info("Now working on piwikId: " + siteId);
@@ -242,7 +246,6 @@ public class PiwikDownloadLogs {
 			rs_date.close();
 
 			for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
-				// Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID, fs);
 				Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
 				executor.execute(worker);// calling execute method of ExecutorService
 			}
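The two hunks above dispatch one Runnable per (siteId, day) pair to a fixed pool that this commit shrinks from 20 to 10 threads. Below is a minimal sketch of that dispatch pattern; the DownloadDispatcher class, the lambda body standing in for WorkerThread, and the shutdown/awaitTermination handling at the end are assumptions for illustration and are not shown in this diff.

import java.util.Calendar;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class DownloadDispatcher {

	public static void dispatch(List<Integer> piwikIdToVisit, Calendar start, Calendar end)
		throws InterruptedException {

		// fixed pool of 10 threads, as in the hunk above
		ExecutorService executor = Executors.newFixedThreadPool(10);
		for (int siteId : piwikIdToVisit) {
			for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
				// clone the mutable Calendar so each task sees its own day
				final Calendar day = (Calendar) currDay.clone();
				executor.execute(() -> System.out.println("download siteId=" + siteId + " day=" + day.getTime()));
			}
		}
		executor.shutdown();
		executor.awaitTermination(1, TimeUnit.DAYS);
	}
}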
@@ -225,17 +225,6 @@ public class PiwikStatsDB {
 		}
 	}
 
-	// public void usageStats() throws Exception {
-	// try {
-	// viewsStats();
-	// downloadsStats();
-	// log.info("stat tables and views done");
-	// } catch (Exception e) {
-	// log.error("Failed to create usage usagestats: " + e);
-	// throw new Exception("Failed to create usage usagestats: " + e.toString(), e);
-	// }
-	// }
-
 	public void processRepositoryLog() throws Exception {
 
 		Statement stmt = ConnectDB.getHiveConnection().createStatement();
@@ -94,8 +94,6 @@ public class SarcStats {
 		stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
 		logger.info("Added JSON Serde jar");
 
-		// " + issn.replace("-", "_"
-
 		logger.info("Dropping sarc_sushilogtmp_json_array table");
 		String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " +
 			ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";