diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java index 394078ff8b..2dc2731c72 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/SarcStats.java @@ -11,8 +11,10 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Calendar; import java.util.HashSet; +import java.util.List; import java.util.Set; import org.apache.hadoop.conf.Configuration; @@ -66,7 +68,8 @@ public class SarcStats { } } - public void processSarc() throws Exception { + public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray, + String url, String issn) throws Exception { Statement stmt = ConnectDB.getConnection().createStatement(); ConnectDB.getConnection().setAutoCommit(false); @@ -74,16 +77,16 @@ public class SarcStats { stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); System.out.println("====> Added JSON Serde jar"); - System.out.println("====> Dropping sarc_sushilogtmp_json_array table"); + System.out.println("====> Dropping sarc_sushilogtmp_json_array_" + issn.replace("-", "_") + " table"); String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + - ".sarc_sushilogtmp_json_array"; + ".sarc_sushilogtmp_json_array_" + issn.replace("-", "_"); stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); - System.out.println("====> Dropped sarc_sushilogtmp_json_array table"); + System.out.println("====> Dropped sarc_sushilogtmp_json_array_" + issn.replace("-", "_") + " table"); - System.out.println("====> Creating sarc_sushilogtmp_json_array 
table"); + System.out.println("====> Creating sarc_sushilogtmp_json_array_" + issn.replace("-", "_") + " table"); String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array_" + issn.replace("-", "_") + "(\n" + " `ItemIdentifier` ARRAY<\n" + " struct<\n" + " `Type`: STRING,\n" + @@ -102,21 +105,21 @@ public class SarcStats { " >\n" + ")" + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + UsageStatsExporter.sarcsReportPathArray + "'\n" + + "LOCATION '" + sarcsReportPathArray + "/" + issn + "'\n" + "TBLPROPERTIES (\"transactional\"=\"false\")"; stmt.executeUpdate(create_sarc_sushilogtmp_json_array); - System.out.println("====> Created sarc_sushilogtmp_json_array table"); + System.out.println("====> Created sarc_sushilogtmp_json_array_" + issn.replace("-", "_") + " table"); - System.out.println("====> Dropping sarc_sushilogtmp_json_non_array table"); + System.out.println("====> Dropping sarc_sushilogtmp_json_non_array_" + issn.replace("-", "_") + " table"); String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + - ".sarc_sushilogtmp_json_non_array"; + ".sarc_sushilogtmp_json_non_array_" + issn.replace("-", "_"); stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); - System.out.println("====> Dropped sarc_sushilogtmp_json_non_array table"); + System.out.println("====> Dropped sarc_sushilogtmp_json_non_array_" + issn.replace("-", "_") + " table"); - System.out.println("====> Creating sarc_sushilogtmp_json_non_array table"); + System.out.println("====> Creating sarc_sushilogtmp_json_non_array_" + issn.replace("-", "_") + " table"); String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + - ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array(\n" + + 
ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array_" + issn.replace("-", "_") + "(\n" + " `ItemIdentifier` struct<\n" + " `Type`: STRING,\n" + " `Value`: STRING\n" + @@ -133,94 +136,131 @@ public class SarcStats { " >" + ")" + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + - "LOCATION '" + UsageStatsExporter.sarcsReportPathNonArray + "'\n" + + "LOCATION '" + sarcsReportPathNonArray + "/" + issn + "'\n" + "TBLPROPERTIES (\"transactional\"=\"false\")"; stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); - System.out.println("====> Created sarc_sushilogtmp_json_non_array table"); + System.out.println("====> Created sarc_sushilogtmp_json_non_array_" + issn.replace("-", "_") + " table"); -// System.out.println("====> Dropping sarc_sushilogtmp table"); -// String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + -// ConnectDB.getUsageStatsDBSchema() + -// ".sushilogtmp"; -// stmt.executeUpdate(drop_sarc_sushilogtmp); -// System.out.println("====> Dropped sarc_sushilogtmp table"); + System.out.println("====> Creating sarc_sushilogtmp table"); + String create_sarc_sushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(create_sarc_sushilogtmp); + System.out.println("====> Created sarc_sushilogtmp table"); + +// "INSERT INTO sushilog (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)"); + +// JSONObject itemPerformance = (JSONObject) jsonObjectRow.get("c:ItemPerformance"); +// // for (Object perf : itemPerformance) { +// JSONObject performance = (JSONObject) itemPerformance; +// JSONObject periodObj = (JSONObject) performance.get("c:Period"); +// String period = periodObj.get("c:Begin").toString(); +// JSONObject instanceObj = (JSONObject) 
performance.get("c:Instance"); +// String type = instanceObj.get("c:MetricType").toString(); +// String count = instanceObj.get("c:Count").toString(); +// // System.out.println(rid + " : " + period + " : " + count); // -// System.out.println("====> Creating sarc_sushilogtmp table"); -// String create_sarc_sushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() -// + ".sarc_sushilogtmp(source STRING, repository STRING, " + -// "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " -// + -// "tblproperties('transactional'='true')"; -// stmt.executeUpdate(create_sarc_sushilogtmp); -// System.out.println("====> Created sarc_sushilogtmp table"); -// -// System.out.println("====> Inserting to sarc_sushilogtmp table"); -// String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + -// "SELECT 'IRUS-UK', 'opendoar____::', `ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + -// "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + -// "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilogtmp_json " + -// "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + -// "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + -// "WHERE `ItemIdent`.`Type`= 'OAI'"; -// stmt.executeUpdate(insert_sarc_sushilogtmp); -// System.out.println("====> Inserted to sarc_sushilogtmp table"); +// preparedStatement.setString(1, "SARC-OJS"); +// preparedStatement.setString(2, issn); +// // preparedStatement.setString(2, url); +// preparedStatement.setString(3, rid); +// preparedStatement.setString(4, period); +// preparedStatement.setString(5, type); +// preparedStatement.setInt(6, Integer.parseInt(count)); + + System.out.println("====> Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + 
".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', '" + issn + "' , `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array_" + issn.replace("-", "_") + " " + + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "; + stmt.executeUpdate(insert_sarc_sushilogtmp); + System.out.println("====> Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + + System.out.println("====> Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', '" + issn + "' , `ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array_" + issn.replace("-", "_"); + stmt.executeUpdate(insert_sarc_sushilogtmp); + System.out.println("====> Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); ConnectDB.getConnection().close(); } - public void getSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { - // There was a problem to download the following file - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", - "1646-107X"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", - "0873-819X"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", - "1646-2335"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", - "2182-3030"); - getARReport( - sarcsReportPathArray, 
sarcsReportPathNonArray, "https://actapediatrica.spp.pt/sushiLite/v1_7/", - "0873-9781"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", - "0873-6529"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", - "0430-5027"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", - "2182-8474"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", - "0871-6099"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", - "0871-9187"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", - "1646-091X"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", - "2183-5799"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", - "1647-2098"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", - "0872-0754"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", - "1645-3794"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", - "1645-8826"); - getARReport( - sarcsReportPathArray, sarcsReportPathNonArray, "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", - "0873-3015"); + public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + + Statement stmt = ConnectDB.getConnection().createStatement(); + ConnectDB.getConnection().setAutoCommit(false); + System.out.println("====> Dropping sarc_sushilogtmp table"); + 
String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp"; + stmt.executeUpdate(drop_sarc_sushilogtmp); + System.out.println("====> Dropped sarc_sushilogtmp table"); + ConnectDB.getConnection().close(); + + List issnAndUrls = new ArrayList(); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" + }); + issnAndUrls.add(new String[] { + "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", 
"1645-8826" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" + }); + + for (String[] issnAndUrl : issnAndUrls) { + getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); + processSarc(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); + } + } public void sarcStats() throws Exception { @@ -275,6 +315,10 @@ public class SarcStats { "INSERT INTO sushilog (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)"); int batch_size = 0; + // Creating the directory to save the files + new File(sarcsReportPathArray + "/" + issn).mkdirs(); + new File(sarcsReportPathNonArray + "/" + issn).mkdirs(); + while (start.before(end)) { // String reportUrl = // "http://irus.mimas.ac.uk/api/sushilite/v1_7/GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" @@ -329,19 +373,18 @@ public class SarcStats { // Creating the file in the filesystem for the ItemIdentifier as array object FileSystem fsArray = FileSystem.get(new Configuration()); - String filePathArray = sarcsReportPathArray + "/" + "SarcsARReport_" + issn + "_" + + String filePathArray = sarcsReportPathArray + "/" + issn + "/" + "SarcsARReport" + simpleDateFormat.format(start.getTime()) + ".json"; System.out.println("Storing to file: " + filePathArray); FSDataOutputStream finArray = fsArray.create(new Path(filePathArray), true); // Creating the file in the filesystem for the ItemIdentifier as array object FileSystem fsNonArray = FileSystem.get(new Configuration()); - String filePathNonArray = sarcsReportPathNonArray + "/" + "SarcsARReport_" + issn + "_" + + String filePathNonArray = sarcsReportPathNonArray + "/" + issn + "/" + "SarcsARReport" + simpleDateFormat.format(start.getTime()) + ".json"; System.out.println("Storing to file: " + filePathNonArray); FSDataOutputStream finNonArray = fsNonArray.create(new Path(filePathNonArray), true); - String rid = ""; for (Object aJsonArray : 
jsonArray) { JSONObject jsonObjectRow = (JSONObject) aJsonArray; @@ -357,15 +400,17 @@ public class SarcStats { } } - if (finArray.size() == 0) - fsArray.deleteOnExit(new Path(filePathArray)); - - if (finNonArray.size() == 0) - fsNonArray.deleteOnExit(new Path(filePathNonArray)); - finArray.close(); finNonArray.close(); + // Delete the report files if they are empty (zero length). NOTE(review): java.io.File is used here while the files were created through Hadoop FileSystem - confirm this check sees them + File fileArray = new File(filePathArray); + if (fileArray.length() == 0) + fileArray.delete(); + File fileNonArray = new File(filePathNonArray); + if (fileNonArray.length() == 0) + fileNonArray.delete(); + ////////////////// // JSONObject jsonObjectRow = (JSONObject) aJsonArray; // JSONArray itemIdentifier = new JSONArray(); diff --git a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java index 3d675ccfe7..950d8cff84 100644 --- a/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java +++ b/dhp-workflows/dhp-usage-stats-update/src/main/java/eu/dnetlib/oa/graph/usagestats/export/UsageStatsExporter.java @@ -60,8 +60,7 @@ public class UsageStatsExporter { // log.info("irus done"); SarcStats sarcStats = new SarcStats(); -// sarcStats.getSarc(sarcsReportPathArray, sarcsReportPathNonArray); - sarcStats.processSarc(); + sarcStats.getAndProcessSarc(sarcsReportPathArray, sarcsReportPathNonArray); // sarcStats.sarcStats(); log.info("sarc done");