forked from D-Net/dnet-hadoop
Better file re-organisation for piwiklogs
parent 7fdf994eb6
commit 1ca74ce830
@@ -64,27 +64,22 @@ public class PiwikDownloadLogs {
 	private String getJson(String url) throws Exception {
 		try {
-			logger.info("Connecting to download the JSON: " + url);
+			logger.debug("Connecting to download the JSON: " + url);
 			URL website = new URL(url);
 			URLConnection connection = website.openConnection();
 
 			// connection.setRequestProperty ("Authorization", "Basic "+encoded);
 			StringBuilder response;
 			try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
 				response = new StringBuilder();
 				String inputLine;
 				while ((inputLine = in.readLine()) != null) {
 					response.append(inputLine);
 					// response.append("\n");
 				}
 			}
 
 			// System.out.println("response ====> " + response.toString());
 
 			return response.toString();
 		} catch (Exception e) {
-			logger.error("Failed to get URL: " + e);
-			throw new Exception("Failed to get URL: " + e.toString(), e);
+			logger.error("Failed to get URL: " + url + " Exception: " + e);
+			throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
 		}
 	}
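For context, getJson above is the standard read-a-URL-into-a-String pattern; the Matomo request URL itself is built elsewhere in this class and is not part of this hunk. A minimal, self-contained sketch of the same pattern (the URL below is a placeholder, not the real endpoint):

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

public class ReadUrlSketch {

	// Minimal sketch of the same pattern: open the connection, read the body line by
	// line, return the concatenated text. Error handling is left to the caller.
	static String readBody(String url) throws Exception {
		URLConnection connection = new URL(url).openConnection();
		StringBuilder response = new StringBuilder();
		try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
			String inputLine;
			while ((inputLine = in.readLine()) != null) {
				response.append(inputLine);
			}
		}
		return response.toString();
	}

	public static void main(String[] args) throws Exception {
		// Placeholder URL, not the Matomo endpoint used by PiwikDownloadLogs.
		System.out.println(readBody("https://example.org/"));
	}
}
```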
@@ -105,10 +100,11 @@ public class PiwikDownloadLogs {
 		}
 
 		public void run() {
+			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
 			System.out
 				.println(
 					Thread.currentThread().getName() + " (Start) Thread for "
-						+ "parameters: currDay=" + currDay + ", siteId=" + siteId +
+						+ "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId +
 						", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath +
 						", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
 			try {
@@ -121,7 +117,7 @@ public class PiwikDownloadLogs {
 			System.out
 				.println(
 					Thread.currentThread().getName() + " (End) Thread for "
-						+ "parameters: currDay=" + currDay + ", siteId=" + siteId +
+						+ "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId +
 						", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath +
 						", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
 		}
@@ -151,9 +147,9 @@ public class PiwikDownloadLogs {
 			JSONParser parser = new JSONParser();
 			StringBuffer totalContent = new StringBuffer();
 			FileSystem fs = FileSystem.get(new Configuration());
-			FSDataOutputStream fin = fs
-				.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);
 
 			do {
+				int writtenBytes = 0;
 				String apiUrl = baseApiUrl;
 
 				if (i > 0) {
@@ -164,23 +160,31 @@ public class PiwikDownloadLogs {
 				if (content.length() == 0 || content.equals("[]"))
 					break;
 
+				FSDataOutputStream fin = fs
+					.create(
+						new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+							+ ".json"),
+						true);
 				JSONArray jsonArray = (JSONArray) parser.parse(content);
 				for (Object aJsonArray : jsonArray) {
 					JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
-					fin.write(jsonObjectRaw.toJSONString().getBytes());
+					byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+					fin.write(jsonObjectRawBytes);
 					fin.writeChar('\n');
 					// totalContent.append(jsonObjectRaw.toJSONString());
 					// totalContent.append('\n');
+
+					writtenBytes += jsonObjectRawBytes.length + 1;
 				}
 
+				fin.close();
+				System.out
+					.println(
+						Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+							+ " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+							+ ".json");
+
 				i++;
 			} while (true);
 
 			// FileSystem fs = FileSystem.get(new Configuration());
 			// FSDataOutputStream fin = fs
 			// .create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);
 			//
 			// fin.write(totalContent.toString().getBytes());
-			fin.close();
 			fs.close();
 		}
 	}
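The code above writes each Matomo offset to its own `<siteId>_Piwiklog<date>_offset_<i>.json` file in HDFS, one JSON object per line. A rough, self-contained sketch of that write pattern, assuming the same json-simple and Hadoop FileSystem APIs; the folder, site id, date and sample content below are placeholders, not values from the commit:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class PerOffsetWriteSketch {

	// Writes one chunk of the API response to its own "<site>_Piwiklog<date>_offset_<i>.json"
	// file, one JSON object per line, and returns the number of bytes written.
	static int writeChunk(FileSystem fs, String outFolder, int siteId, String date,
		int offset, String content) throws Exception {

		JSONArray jsonArray = (JSONArray) new JSONParser().parse(content);
		Path target = new Path(outFolder + "/" + siteId + "_Piwiklog" + date + "_offset_" + offset + ".json");

		int writtenBytes = 0;
		try (FSDataOutputStream fin = fs.create(target, true)) {
			for (Object aJsonArray : jsonArray) {
				byte[] bytes = ((JSONObject) aJsonArray).toJSONString().getBytes();
				fin.write(bytes);
				fin.write('\n'); // single-byte newline between JSON objects
				writtenBytes += bytes.length + 1;
			}
		}
		return writtenBytes;
	}

	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration());
		// Placeholder values; the real ones come from the workflow parameters.
		int bytes = writeChunk(fs, "/tmp/piwiklogs", 5, "2020-03-01", 0, "[{\"idVisit\":1}]");
		System.out.println("Wrote " + bytes + " bytes");
		fs.close();
	}
}
```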
@@ -221,7 +225,7 @@ public class PiwikDownloadLogs {
 		logger.info("Ending period for log download: " + sdf.format(end.getTime()));
 
 		// FileSystem fs = FileSystem.get(new Configuration());
-		ExecutorService executor = Executors.newFixedThreadPool(20);// creating a pool of 5 threadsσ
+		ExecutorService executor = Executors.newFixedThreadPool(10);// creating a pool of 5 threadsσ
 		for (int siteId : piwikIdToVisit) {
 
 			logger.info("Now working on piwikId: " + siteId);
@@ -242,7 +246,6 @@ public class PiwikDownloadLogs {
 		rs_date.close();
 
 		for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
-			// Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID, fs);
 			Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
 			executor.execute(worker);// calling execute method of ExecutorService
 		}
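For orientation, this loop hands one WorkerThread per (siteId, day) pair to the fixed thread pool created earlier in the method. A compact, self-contained sketch of that scheduling pattern, with a placeholder lambda standing in for WorkerThread and hard-coded dates and site ids (assumptions, not values from the commit):

```java
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class DownloadSchedulingSketch {

	public static void main(String[] args) throws InterruptedException {
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
		ExecutorService executor = Executors.newFixedThreadPool(10);

		// Placeholder period and site ids; the real ones come from the stats database.
		Calendar start = Calendar.getInstance();
		start.add(Calendar.DATE, -3);
		Calendar end = Calendar.getInstance();
		int[] piwikIdsToVisit = {5, 13};

		for (int siteId : piwikIdsToVisit) {
			// One task per day, mirroring the WorkerThread submission in the loop above.
			for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
				String day = sdf.format(currDay.getTime());
				executor.execute(() -> System.out.println("download siteId=" + siteId + " day=" + day));
			}
		}

		// Wait for all per-day downloads to finish before moving on.
		executor.shutdown();
		executor.awaitTermination(1, TimeUnit.HOURS);
	}
}
```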
@@ -225,17 +225,6 @@ public class PiwikStatsDB {
 		}
 	}
 
-	// public void usageStats() throws Exception {
-	// try {
-	// viewsStats();
-	// downloadsStats();
-	// log.info("stat tables and views done");
-	// } catch (Exception e) {
-	// log.error("Failed to create usage usagestats: " + e);
-	// throw new Exception("Failed to create usage usagestats: " + e.toString(), e);
-	// }
-	// }
-
 	public void processRepositoryLog() throws Exception {
 
 		Statement stmt = ConnectDB.getHiveConnection().createStatement();
@@ -94,8 +94,6 @@ public class SarcStats {
 		stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
 		logger.info("Added JSON Serde jar");
 
-		// " + issn.replace("-", "_"
-
 		logger.info("Dropping sarc_sushilogtmp_json_array table");
 		String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " +
 			ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
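The dropped table is presumably recreated further down in this file (outside the hunk) using the JSON SerDe that the added hive-hcatalog-core jar provides. Purely as a hedged illustration of that technique, a JSON staging table could be declared roughly as below; the connection URL, schema, columns and location are illustrative assumptions, not the project's actual DDL:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class JsonSerdeTableSketch {

	public static void main(String[] args) throws Exception {
		// Placeholder JDBC URL; the project resolves its connection through ConnectDB.
		try (Connection conn = DriverManager.getConnection("jdbc:hive2://hive.example.org:10000/usagestats");
			Statement stmt = conn.createStatement()) {

			// Make the HCatalog JSON SerDe available to the session, as in the hunk above.
			stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");

			// Illustrative schema and location only: an external table whose columns are
			// mapped onto the JSON keys of the downloaded report files.
			stmt.executeUpdate("CREATE EXTERNAL TABLE IF NOT EXISTS usagestats.sarc_sushilogtmp_json_array ("
				+ " itemidentifier array<struct<type:string,value:string>>)"
				+ " ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'"
				+ " LOCATION '/user/hive/sarc_sushilogs/'");
		}
	}
}
```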