Adding the main java files, the directory structure and main workflow file

This commit is contained in:
Spyros Zoupanos 2020-05-07 19:00:03 +03:00
parent ac0da5a7ee
commit af62b14f91
13 changed files with 2244 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.1.7-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-usage-stats-update</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<configuration>
<failOnNoGitDirectory>false</failOnNoGitDirectory>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,66 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.usagestats.export;
/*
 * @author dpie
 */
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Properties;
import org.apache.log4j.Logger;
public abstract class ConnectDB {
private static Connection DB_CONNECTION;
private static String dbURL;
private static String dbUsername;
private static String dbPassword;
private static String defaultDBSchema;
private final static Logger log = Logger.getLogger(ConnectDB.class);
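/**
 * Reads the connection parameters from the given properties and loads the JDBC driver.
 * Expected keys, as consumed below: Stats_db_Url, Stats_db_User, Stats_db_Pass,
 * Stats_db_Schema and Stats_db_Driver.
 */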
static void init(Properties properties) throws ClassNotFoundException {
dbURL = properties.getProperty("Stats_db_Url");
dbUsername = properties.getProperty("Stats_db_User");
dbPassword = properties.getProperty("Stats_db_Pass");
defaultDBSchema = properties.getProperty("Stats_db_Schema");
Class.forName(properties.getProperty("Stats_db_Driver"));
}
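/**
 * Returns the shared connection, opening a new one only if none exists yet or the
 * previous one has been closed; callers close it freely and rely on this check.
 */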
public static Connection getConnection() throws SQLException {
if (DB_CONNECTION != null && !DB_CONNECTION.isClosed()) {
return DB_CONNECTION;
} else {
DB_CONNECTION = connect();
return DB_CONNECTION;
}
}
private static Connection connect() throws SQLException {
Connection connection = DriverManager.getConnection(dbURL, dbUsername, dbPassword);
Statement stmt = connection.createStatement();
String sqlSetSearchPath = "SET search_path TO " + defaultDBSchema + ";";
stmt.executeUpdate(sqlSetSearchPath);
stmt.close();
log.debug("Opened database successfully");
return connection;
}
}

View File

@@ -0,0 +1,43 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.usagestats.export;
import java.io.InputStream;
import java.util.Properties;
/**
* @author dpie
*/
public class ExecuteWorkflow {
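/*
 * Loads usagestats.properties from the classpath and runs the full export.
 * A minimal sketch of that file (all values below are placeholders, not real
 * endpoints or credentials; the keys are the ones read by ConnectDB and
 * UsageStatsExporter):
 *
 *   Stats_db_Url=jdbc:postgresql://stats.example.org:5432/stats
 *   Stats_db_User=dbuser
 *   Stats_db_Pass=dbpass
 *   Stats_db_Schema=usagestats
 *   Stats_db_Driver=org.postgresql.Driver
 *   matomo_AuthToken=placeholder-token
 *   matomo_BaseUrl=analytics.example.org
 *   repo_LogPath=/tmp/repolog
 *   portal_LogPath=/tmp/portallog
 *   portal_MatomoID=109
 *   IRUS_UK_BaseUrl=https://irus.jisc.ac.uk/api/sushilite/v1_7/
 *   COUNTER_robots_Url=https://example.org/COUNTER_Robots_list.json
 */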
public static void main(String[] args) throws Exception {
Properties prop = new Properties();
InputStream propertiesInputStream = UsageStatsExporter.class
.getClassLoader()
.getResourceAsStream("usagestats.properties");
prop.load(propertiesInputStream);
UsageStatsExporter usagestatsExport = new UsageStatsExporter(prop);
usagestatsExport.export();
}
}

View File

@@ -0,0 +1,431 @@
package eu.dnetlib.usagestats.export;
/**
 * @author dpie
 */
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import org.apache.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
/**
* Created by dpie on 20/01/2020.
*/
public class IrusStats {
private String irusUKURL;
// private Connection conn = null;
// private Statement stmt = null;
private final Logger log = Logger.getLogger(this.getClass());
public IrusStats(String irusUKURL) throws Exception {
this.irusUKURL = irusUKURL;
createTables();
createTmpTables();
}
private void createTables() throws Exception {
try {
Statement stmt = ConnectDB.getConnection().createStatement();
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
stmt.executeUpdate(sqlCreateTableSushiLog);
String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ " ON INSERT TO sushilog "
+ " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
+ "sushilog.rid, sushilog.date "
+ "FROM sushilog "
+ "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
stmt.executeUpdate(sqlcreateRuleSushiLog);
String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
stmt.executeUpdate(createSushiIndex);
stmt.close();
ConnectDB.getConnection().close();
log.info("Sushi Tables Created");
} catch (Exception e) {
log.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
}
private void createTmpTables() throws Exception {
try {
Statement stmt = ConnectDB.getConnection().createStatement();
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilogtmp(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
stmt.executeUpdate(sqlCreateTableSushiLog);
// stmt.executeUpdate("CREATE TABLE IF NOT EXISTS public.sushilog AS TABLE sushilog;");
// String sqlCopyPublicSushiLog = "INSERT INTO sushilog SELECT * FROM public.sushilog;";
// stmt.executeUpdate(sqlCopyPublicSushiLog);
String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ " ON INSERT TO sushilogtmp "
+ " WHERE (EXISTS ( SELECT sushilogtmp.source, sushilogtmp.repository,"
+ "sushilogtmp.rid, sushilogtmp.date "
+ "FROM sushilogtmp "
+ "WHERE sushilogtmp.source = new.source AND sushilogtmp.repository = new.repository AND sushilogtmp.rid = new.rid AND sushilogtmp.date = new.date AND sushilogtmp.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
stmt.executeUpdate(sqlcreateRuleSushiLog);
stmt.close();
ConnectDB.getConnection().close();
log.info("Sushi Tmp Tables Created");
} catch (Exception e) {
log.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
}
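/**
 * Moves the harvested IRUS-UK rows into the permanent tables: ft_total counts from
 * sushilogtmp are mapped to repository/result ids (via the public *_oids tables) and
 * inserted into downloads_stats, then the tmp rows are appended to sushilog.
 */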
public void irusStats() throws Exception {
Statement stmt = ConnectDB.getConnection().createStatement();
ConnectDB.getConnection().setAutoCommit(false);
// String sql = "INSERT INTO sushi_result_downloads SELECT s.source, d.id AS repository, ro.id, s.date, s.count
// FROM sushilog s, datasource_oids d, result_oids ro WHERE s.repository=d.orid AND s.oai=ro.orid AND
// metric_type='ft_total'";
// String sql = "SELECT s.source, d.id AS repository_id, ro.id as result_id, extract('year' from s.date::date)
// ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') as date, s.count INTO
// downloads_stats FROM sushilog s, datasource_oids d, result_oids ro WHERE s.repository=d.orid AND
// s.oai=ro.orid AND metric_type='ft_total'";
// String sql = "INSERT INTO downloads_stats SELECT s.source, d.id AS repository_id, ro.id as result_id,
// extract('year' from s.date::date) ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0')
// as date, s.count FROM sushilog s, datasource_oids d, result_oids ro WHERE s.repository=d.orid AND
// s.oai=ro.orid AND metric_type='ft_total';";
String sql = "INSERT INTO downloads_stats SELECT s.source, d.id AS repository_id, ro.id as result_id, extract('year' from s.date::date) ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') as date, s.count, '0' FROM sushilogtmp s, public.datasource_oids d, public.result_oids ro WHERE s.repository=d.orid AND s.rid=ro.orid AND metric_type='ft_total' AND s.source='IRUS-UK';";
stmt.executeUpdate(sql);
sql = "Insert into sushilog select * from sushilogtmp;";
stmt.executeUpdate(sql);
ConnectDB.getConnection().commit();
ConnectDB.getConnection().close();
}
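/**
 * Fetches the IRUS-UK RR1 (repository) report since 2016-01, extracts the OpenDOAR
 * identifier of every repository it lists, and triggers a per-repository IR1 harvest
 * through processIrusIRReport.
 */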
public void processIrusRRReport() throws Exception {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
// String reportUrl = "https://irus.jisc.ac.uk" +
// "/api/sushilite/v1_7/GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=2016-01&EndDate=" +
// simpleDateFormat.format(new Date()) +
// "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback=";
String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=2016-01&EndDate="
+ simpleDateFormat.format(new Date())
+ "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback=";
log.info("Getting Irus report: " + reportUrl);
String text = getJson(reportUrl, "", "");
log.info("Report: " + text);
JSONParser parser = new JSONParser();
JSONObject jsonObject = (JSONObject) parser.parse(text);
jsonObject = (JSONObject) jsonObject.get("ReportResponse");
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Customer");
JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
int i = 0;
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
for (Object identifier : itemIdentifier) {
JSONObject opendoar = (JSONObject) identifier;
if (opendoar.get("Type").toString().equals("OpenDOAR")) {
// System.out.println(i + ": " + opendoar.get("Value").toString());
log.info(i + ": " + opendoar.get("Value").toString());
i++;
processIrusIRReport(opendoar.get("Value").toString());
break;
}
}
// break;
}
}
private void processIrusIRReport(String opendoar) throws Exception {
log.info("Processing IRUS-UK IR report for opendoar: " + opendoar);
ConnectDB.getConnection().setAutoCommit(false);
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
Calendar start = Calendar.getInstance();
start.set(Calendar.YEAR, 2016);
start.set(Calendar.MONTH, Calendar.JANUARY);
// start.setTime(simpleDateFormat.parse("2016-01"));
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
PreparedStatement st = ConnectDB
.getConnection()
.prepareStatement("SELECT max(date) FROM sushilog WHERE repository=?;");
st.setString(1, "opendoar____::" + opendoar);
ResultSet rs_date = st.executeQuery();
while (rs_date.next()) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
}
}
rs_date.close();
PreparedStatement preparedStatement = ConnectDB
.getConnection()
.prepareStatement(
"INSERT INTO sushilogtmp (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)");
int batch_size = 0;
while (start.before(end)) {
// log.info("date: " + simpleDateFormat.format(start.getTime()));
String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime())
+ "&RepositoryIdentifier=opendoar%3A" + opendoar
+ "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
start.add(Calendar.MONTH, 1);
String text = getJson(reportUrl, "", "");
if (text == null) {
continue;
}
JSONParser parser = new JSONParser();
JSONObject jsonObject = (JSONObject) parser.parse(text);
jsonObject = (JSONObject) jsonObject.get("ReportResponse");
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Customer");
JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
if (jsonArray == null) {
continue;
}
String oai = "";
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
for (Object identifier : itemIdentifier) {
JSONObject oaiPmh = (JSONObject) identifier;
if (oaiPmh.get("Type").toString().equals("OAI")) {
oai = oaiPmh.get("Value").toString();
// System.out.println("OAI: " + oai);
break;
}
}
JSONArray itemPerformance = (JSONArray) jsonObjectRow.get("ItemPerformance");
String period;
String type;
String count;
for (Object perf : itemPerformance) {
JSONObject performance = (JSONObject) perf;
JSONObject periodObj = (JSONObject) performance.get("Period");
period = periodObj.get("Begin").toString();
JSONObject instanceObj = (JSONObject) performance.get("Instance");
type = instanceObj.get("MetricType").toString();
count = instanceObj.get("Count").toString();
// System.out.println(oai + " : " + period + " : " + count);
preparedStatement.setString(1, "IRUS-UK");
preparedStatement.setString(2, "opendoar____::" + opendoar);
preparedStatement.setString(3, oai);
preparedStatement.setString(4, period);
preparedStatement.setString(5, type);
preparedStatement.setInt(6, Integer.parseInt(count));
preparedStatement.addBatch();
batch_size++;
if (batch_size == 10000) {
preparedStatement.executeBatch();
ConnectDB.getConnection().commit();
batch_size = 0;
}
}
// break;
}
// break;
}
preparedStatement.executeBatch();
ConnectDB.getConnection().commit();
ConnectDB.getConnection().close();
}
public void processIrusIRReport(String opendoar, String startDate) throws Exception {
ConnectDB.getConnection().setAutoCommit(false);
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
Calendar start = Calendar.getInstance();
start.set(Calendar.YEAR, 2016);
start.set(Calendar.MONTH, Calendar.JANUARY);
// start.setTime(simpleDateFormat.parse("2016-01"));
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
start.setTime(sdf.parse(startDate));
String createTablesQuery = "-- Table: shadow.sushilog" + opendoar + "\n"
+ "\n"
+ "-- DROP TABLE shadow.sushilog" + opendoar + ";\n"
+ "\n"
+ "CREATE TABLE shadow.sushilog" + opendoar + "\n"
+ "(\n"
+ " source text COLLATE pg_catalog.\"default\" NOT NULL,\n"
+ " repository text COLLATE pg_catalog.\"default\" NOT NULL,\n"
+ " rid text COLLATE pg_catalog.\"default\" NOT NULL,\n"
+ " date text COLLATE pg_catalog.\"default\" NOT NULL,\n"
+ " metric_type text COLLATE pg_catalog.\"default\" NOT NULL,\n"
+ " count integer,\n"
+ " CONSTRAINT sushilog" + opendoar + "_pkey PRIMARY KEY (source, repository, rid, date, metric_type)\n"
+ " USING INDEX TABLESPACE index_storage\n"
+ ")\n"
+ "\n"
+ "TABLESPACE pg_default;\n"
+ "\n"
+ "ALTER TABLE shadow.sushilog" + opendoar + "\n"
+ " OWNER to sqoop;\n"
+ "\n"
+ "-- Rule: ignore_duplicate_inserts ON shadow.sushilog" + opendoar + "\n"
+ "\n"
+ "-- DROP Rule ignore_duplicate_inserts ON shadow.sushilog" + opendoar + ";\n"
+ "\n"
+ "CREATE OR REPLACE RULE ignore_duplicate_inserts AS\n"
+ " ON INSERT TO shadow.sushilog" + opendoar + "\n"
+ " WHERE (EXISTS ( SELECT sushilog" + opendoar + ".source,\n"
+ " sushilog" + opendoar + ".repository,\n"
+ " sushilog" + opendoar + ".rid,\n"
+ " sushilog" + opendoar + ".date\n"
+ " FROM sushilog" + opendoar + "\n"
+ " WHERE sushilog" + opendoar + ".source = new.source AND sushilog" + opendoar
+ ".repository = new.repository AND sushilog" + opendoar + ".rid = new.rid AND sushilog" + opendoar
+ ".date = new.date AND sushilog" + opendoar + ".metric_type = new.metric_type))\n"
+ " DO INSTEAD\n"
+ "NOTHING;";
Statement stCreateTables = ConnectDB.getConnection().createStatement();
stCreateTables.execute(createTablesQuery);
ConnectDB.getConnection().commit();
PreparedStatement preparedStatement = ConnectDB
.getConnection()
.prepareStatement(
"INSERT INTO sushilog" + opendoar
+ " (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)");
int batch_size = 0;
while (start.before(end)) {
// log.info("date: " + simpleDateFormat.format(start.getTime()));
String reportUrl = "https://irus.jisc.ac.uk/api/sushilite/v1_7/GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ simpleDateFormat.format(start.getTime()) + "&EndDate=2019-10-31&RepositoryIdentifier=opendoar%3A"
+ opendoar + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
start.add(Calendar.MONTH, 1);
String text = getJson(reportUrl, "", "");
if (text == null) {
continue;
}
JSONParser parser = new JSONParser();
JSONObject jsonObject = (JSONObject) parser.parse(text);
jsonObject = (JSONObject) jsonObject.get("ReportResponse");
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Report");
jsonObject = (JSONObject) jsonObject.get("Customer");
JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
if (jsonArray == null) {
continue;
}
String oai = "";
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
for (Object identifier : itemIdentifier) {
JSONObject oaiPmh = (JSONObject) identifier;
if (oaiPmh.get("Type").toString().equals("OAI")) {
oai = oaiPmh.get("Value").toString();
// System.out.println("OAI: " + oai);
break;
}
}
JSONArray itemPerformance = (JSONArray) jsonObjectRow.get("ItemPerformance");
String period;
String type;
String count;
for (Object perf : itemPerformance) {
JSONObject performance = (JSONObject) perf;
JSONObject periodObj = (JSONObject) performance.get("Period");
period = periodObj.get("Begin").toString();
JSONObject instanceObj = (JSONObject) performance.get("Instance");
type = instanceObj.get("MetricType").toString();
count = instanceObj.get("Count").toString();
// System.out.println(oai + " : " + period + " : " + count);
preparedStatement.setString(1, "IRUS-UK");
preparedStatement.setString(2, "opendoar____::" + opendoar);
preparedStatement.setString(3, oai);
preparedStatement.setString(4, period);
preparedStatement.setString(5, type);
preparedStatement.setInt(6, Integer.parseInt(count));
preparedStatement.addBatch();
batch_size++;
if (batch_size == 10000) {
preparedStatement.executeBatch();
ConnectDB.getConnection().commit();
batch_size = 0;
}
}
// break;
}
// break;
}
preparedStatement.executeBatch();
ConnectDB.getConnection().commit();
ConnectDB.getConnection().close();
}
private String getJson(String url, String username, String password) throws Exception {
// String cred=username+":"+password;
// String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
try {
URL website = new URL(url);
URLConnection connection = website.openConnection();
// connection.setRequestProperty ("Authorization", "Basic "+encoded);
StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder();
String inputLine;
while ((inputLine = in.readLine()) != null) {
response.append(inputLine);
response.append("\n");
}
}
return response.toString();
} catch (Exception e) {
log.error("Failed to get URL", e);
return null;
}
}
}

View File

@@ -0,0 +1,132 @@
package eu.dnetlib.usagestats.export;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Calendar;
public class PiwikDownloadLogs {
private final String piwikUrl;
private Date startDate;
private final String tokenAuth;
/*
* The Piwik's API method
*/
private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
private final String format = "&format=json";
private final Logger log = Logger.getLogger(this.getClass());
public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
this.piwikUrl = piwikUrl;
this.tokenAuth = tokenAuth;
}
private String getPiwikLogUrl() {
return "https://" + piwikUrl + "/";
}
private String getJson(String url) throws Exception {
try {
URL website = new URL(url);
URLConnection connection = website.openConnection();
// connection.setRequestProperty ("Authorization", "Basic "+encoded);
StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder();
String inputLine;
while ((inputLine = in.readLine()) != null) {
response.append(inputLine);
response.append("\n");
}
}
return response.toString();
} catch (Exception e) {
log.error("Failed to get URL: " + e);
throw new Exception("Failed to get URL: " + e.toString(), e);
}
}
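/*
 * For every datasource with a piwik_id, downloads the daily Matomo/Piwik visit logs
 * (Live.getLastVisitsDetails, paged via filter_offset in steps of 1000) into HDFS,
 * starting from the last timestamp already present in piwiklog for that site.
 */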
public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
Statement statement = ConnectDB.getConnection().createStatement();
ResultSet rs = statement.executeQuery("SELECT distinct piwik_id from public.datasource where piwik_id is not null order by piwik_id;");
while (rs.next()) {
int siteId = rs.getInt(1);
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
Calendar start = Calendar.getInstance();
start.set(Calendar.YEAR, 2016);
start.set(Calendar.MONTH, Calendar.MARCH);
//start.setTime(simpleDateFormat.parse("2016-01"));
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
PreparedStatement st = ConnectDB.getConnection().prepareStatement("SELECT max(timestamp) FROM piwiklog WHERE source=? HAVING max(timestamp) is not null;");
st.setInt(1, siteId);
ResultSet rs_date = st.executeQuery();
while (rs_date.next()) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") && !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
}
}
rs_date.close();
for (Date date = start.getTime(); start.before(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
log.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
String period = "&period=day&date=" + sdf.format(date);
String outFolder = "";
//portal siteId = 109;
if (siteId == Integer.parseInt(portalMatomoID)) {
outFolder = portalLogPath;
} else {
outFolder = repoLogsPath;
}
FileSystem fs = FileSystem.get(new Configuration());
FSDataOutputStream fin = fs.create(new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + ".json"), true);
String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
String content = "";
int i = 0;
while (!content.equals("[]\n")) {
String apiUrl = baseApiUrl;
if (i > 0) {
apiUrl += "&filter_offset=" + (i * 1000);
}
content = getJson(apiUrl);
fin.write(content.getBytes());
i++;
}
fin.close();
}
}
}

View File

@@ -0,0 +1,56 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package eu.dnetlib.usagestats.export;
/**
 * @author dpie
 */
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import org.json.JSONException;
import org.json.simple.JSONArray;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
public class ReadCounterRobotsList {
private ArrayList<String> robotsPatterns = new ArrayList<>();
private String COUNTER_ROBOTS_URL;
public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException {
COUNTER_ROBOTS_URL = url;
robotsPatterns = readRobotsPatterns(COUNTER_ROBOTS_URL);
}
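/*
 * Downloads the COUNTER robots list (a JSON array of {"pattern": ...} objects) and
 * doubles the backslashes of each pattern, presumably so the regexes survive a later
 * round of string escaping.
 */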
private ArrayList<String> readRobotsPatterns(String url) throws MalformedURLException, IOException, ParseException {
InputStream is = new URL(url).openStream();
JSONParser parser = new JSONParser();
BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1")));
JSONArray jsonArray = (JSONArray) parser.parse(reader);
for (Object aJsonArray : jsonArray) {
org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray;
robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\"));
}
return robotsPatterns;
}
public ArrayList<String> getRobotsPatterns() {
return robotsPatterns;
}
}

View File

@@ -0,0 +1,255 @@
package eu.dnetlib.usagestats.export;
import java.io.*;
// import java.io.BufferedReader;
// import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import org.apache.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
/**
* Created by dpie
*/
public class SarcStats {
private Statement stmt = null;
private final Logger log = Logger.getLogger(this.getClass());
public SarcStats() throws Exception {
createTables();
}
private void createTables() throws Exception {
try {
stmt = ConnectDB.getConnection().createStatement();
String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
stmt.executeUpdate(sqlCreateTableSushiLog);
// String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
// stmt.executeUpdate(sqlCopyPublicSushiLog);
String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ " ON INSERT TO sushilog "
+ " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
+ "sushilog.rid, sushilog.date "
+ "FROM sushilog "
+ "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
stmt.executeUpdate(sqlcreateRuleSushiLog);
String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
stmt.executeUpdate(createSushiIndex);
stmt.close();
ConnectDB.getConnection().close();
log.info("Sushi Tables Created");
} catch (Exception e) {
log.error("Failed to create tables: " + e);
throw new Exception("Failed to create tables: " + e.toString(), e);
}
}
public void processSarc() throws Exception {
processARReport("https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X");
processARReport("https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X");
processARReport("https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335");
processARReport("https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030");
processARReport("https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781");
processARReport("https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529");
processARReport("https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027");
processARReport("https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474");
processARReport("https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099");
processARReport("https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187");
processARReport("https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X");
processARReport("https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799");
processARReport("https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098");
processARReport("https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754");
processARReport("https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794");
processARReport("https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826");
processARReport("https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015");
}
public void sarcStats() throws Exception {
stmt = ConnectDB.getConnection().createStatement();
ConnectDB.getConnection().setAutoCommit(false);
// String sql = "SELECT s.source, d.id AS repository_id, ro.id as result_id, extract('year' from s.date::date)
// ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') as date, s.count, '0' INTO
// downloads_stats FROM sushilog s, datasource_oids d, result_oids ro WHERE s.repository=d.orid AND
// s.rid=ro.orid AND metric_type='ft_total'";
String sql = "INSERT INTO downloads_stats SELECT s.source, d.id AS repository_id, ro.id as result_id, extract('year' from s.date::date) ||'/'|| LPAD(CAST(extract('month' from s.date::date) AS VARCHAR), 2, '0') as date, s.count, '0' FROM sushilog s, public.datasource_oids d, public.datasource_results dr, public.result_pids ro WHERE d.orid LIKE '%' || s.repository || '%' AND dr.id=d.id AND dr.result=ro.id AND s.rid=ro.pid AND ro.type='doi' AND metric_type='ft_total' AND s.source='SARC-OJS';";
stmt.executeUpdate(sql);
stmt.close();
ConnectDB.getConnection().commit();
ConnectDB.getConnection().close();
}
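/**
 * Harvests the SUSHI-Lite AR1 (article) report of one SARC-OJS journal, month by month
 * since 2016-01 (or since the last date already recorded for this ISSN), and stores one
 * sushilog row per DOI/period/metric.
 */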
public void processARReport(String url, String issn) throws Exception {
log.info("Processing SARC! issn: " + issn + " with url: " + url);
ConnectDB.getConnection().setAutoCommit(false);
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
Calendar start = Calendar.getInstance();
start.set(Calendar.YEAR, 2016);
start.set(Calendar.MONTH, Calendar.JANUARY);
// start.setTime(simpleDateFormat.parse("2016-01"));
Calendar end = Calendar.getInstance();
end.add(Calendar.DAY_OF_MONTH, -1);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
PreparedStatement st = ConnectDB
.getConnection()
.prepareStatement("SELECT max(date) FROM sushilog WHERE repository=?;");
st.setString(1, issn);
ResultSet rs_date = st.executeQuery();
while (rs_date.next()) {
if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
&& !rs_date.getString(1).equals("")) {
start.setTime(sdf.parse(rs_date.getString(1)));
}
}
rs_date.close();
PreparedStatement preparedStatement = ConnectDB
.getConnection()
.prepareStatement(
"INSERT INTO sushilog (source, repository, rid, date, metric_type, count) VALUES (?,?,?,?,?,?)");
int batch_size = 0;
while (start.before(end)) {
// String reportUrl =
// "http://irus.mimas.ac.uk/api/sushilite/v1_7/GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
// + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) +
// "&RepositoryIdentifier=opendoar%3A" + opendoar +
// "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
+ simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
// System.out.println(reportUrl);
start.add(Calendar.MONTH, 1);
String text = getJson(reportUrl);
if (text == null) {
continue;
}
/*
* PrintWriter wr = new PrintWriter(new FileWriter("logs/" + simpleDateFormat.format(start.getTime()) +
* ".json")); wr.print(text); wr.close();
*/
JSONParser parser = new JSONParser();
JSONObject jsonObject = (JSONObject) parser.parse(text);
jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse");
jsonObject = (JSONObject) jsonObject.get("sc:Report");
if (jsonObject == null) {
continue;
}
jsonObject = (JSONObject) jsonObject.get("c:Report");
jsonObject = (JSONObject) jsonObject.get("c:Customer");
Object obj = jsonObject.get("c:ReportItems");
JSONArray jsonArray = new JSONArray();
if (obj instanceof JSONObject) {
jsonArray.add(obj);
} else {
jsonArray = (JSONArray) obj;
// jsonArray = (JSONArray) jsonObject.get("c:ReportItems");
}
if (jsonArray == null) {
continue;
}
String rid = "";
for (Object aJsonArray : jsonArray) {
JSONObject jsonObjectRow = (JSONObject) aJsonArray;
JSONArray itemIdentifier = new JSONArray();
obj = jsonObjectRow.get("c:ItemIdentifier");
if (obj instanceof JSONObject) {
itemIdentifier.add(obj);
} else {
// JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("c:ItemIdentifier");
itemIdentifier = (JSONArray) obj;
}
for (Object identifier : itemIdentifier) {
JSONObject doi = (JSONObject) identifier;
if (doi.get("c:Type").toString().equals("DOI")) {
rid = doi.get("c:Value").toString();
// System.out.println("DOI: " + rid);
break;
}
}
if (rid.isEmpty()) {
continue;
}
JSONObject itemPerformance = (JSONObject) jsonObjectRow.get("c:ItemPerformance");
// for (Object perf : itemPerformance) {
JSONObject performance = (JSONObject) itemPerformance;
JSONObject periodObj = (JSONObject) performance.get("c:Period");
String period = periodObj.get("c:Begin").toString();
JSONObject instanceObj = (JSONObject) performance.get("c:Instance");
String type = instanceObj.get("c:MetricType").toString();
String count = instanceObj.get("c:Count").toString();
// System.out.println(rid + " : " + period + " : " + count);
preparedStatement.setString(1, "SARC-OJS");
preparedStatement.setString(2, issn);
// preparedStatement.setString(2, url);
preparedStatement.setString(3, rid);
preparedStatement.setString(4, period);
preparedStatement.setString(5, type);
preparedStatement.setInt(6, Integer.parseInt(count));
preparedStatement.addBatch();
batch_size++;
if (batch_size == 10000) {
preparedStatement.executeBatch();
ConnectDB.getConnection().commit();
batch_size = 0;
}
// }
// break;
}
// break;
}
preparedStatement.executeBatch();
ConnectDB.getConnection().commit();
ConnectDB.getConnection().close();
}
private String getJson(String url) {
// String cred=username+":"+password;
// String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
try {
URL website = new URL(url);
URLConnection connection = website.openConnection();
// connection.setRequestProperty ("Authorization", "Basic "+encoded);
StringBuilder response;
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
response = new StringBuilder();
String inputLine;
while ((inputLine = in.readLine()) != null) {
response.append(inputLine);
response.append("\n");
}
}
return response.toString();
} catch (Exception e) {
log.error("Failed to get URL: " + e);
// System.out.println("Failed to get URL: " + e);
return null;
// throw new Exception("Failed to get URL: " + e.toString(), e);
}
}
}

View File

@@ -0,0 +1,57 @@
package eu.dnetlib.usagestats.export;
import java.io.InputStream;
import java.util.Properties;
import org.apache.log4j.Logger;
public class UsageStatsExporter {
private Logger log = Logger.getLogger(this.getClass());
private Properties properties;
public UsageStatsExporter(Properties properties) {
this.properties = properties;
}
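/*
 * Runs the whole pipeline in order: download the Matomo/Piwik logs, process them into
 * the stats DB, then add the IRUS-UK and SARC-OJS download counts, and finally let
 * PiwikStatsDB finalize the usage-stats tables.
 */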
public void export() throws Exception {
// read workflow parameters
String matomoAuthToken = properties.getProperty("matomo_AuthToken");
String matomoBaseURL = properties.getProperty("matomo_BaseUrl");
String repoLogPath = properties.getProperty("repo_LogPath");
String portalLogPath = properties.getProperty("portal_LogPath");
String portalMatomoID = properties.getProperty("portal_MatomoID");
String irusUKBaseURL = properties.getProperty("IRUS_UK_BaseUrl");
// connect to DB
ConnectDB.init(properties);
PiwikDownloadLogs piwd = new PiwikDownloadLogs(matomoBaseURL, matomoAuthToken);
piwd.GetOpenAIRELogs(repoLogPath, portalLogPath, portalMatomoID);
/*
* Create DB tables, insert/update statistics
*/
PiwikStatsDB piwikstatsdb = new PiwikStatsDB(repoLogPath, portalLogPath);
piwikstatsdb.setCounterRobotsURL(properties.getProperty("COUNTER_robots_Url"));
piwikstatsdb.processLogs();
log.info("process logs done");
IrusStats irusstats = new IrusStats(irusUKBaseURL);
irusstats.processIrusRRReport();
irusstats.irusStats();
log.info("irus done");
SarcStats sarcStats = new SarcStats();
sarcStats.processSarc();
sarcStats.sarcStats();
log.info("sarc done");
// finalize usagestats
piwikstatsdb.finalizeStats();
log.info("finalized stats");
}
}

View File

@@ -0,0 +1,43 @@
<html>
<head>
<title>Revision 58415: /dnet45/modules/dnet-openaire-usage-stats-export-wf/trunk/dnet-openaire-usage-stats-export/src/main/java/eu/dnetlib/usagestats/export</title>
</head>
<body>
<h2>Revision 58415: /dnet45/modules/dnet-openaire-usage-stats-export-wf/trunk/dnet-openaire-usage-stats-export/src/main/java/eu/dnetlib/usagestats/export</h2>
<ul>
<li>
<a href="../">..</a>
</li>
<li>
<a href="ConnectDB.java">ConnectDB.java</a>
</li>
<li>
<a href="ExecuteWorkflow.java">ExecuteWorkflow.java</a>
</li>
<li>
<a href="IrusStats.java">IrusStats.java</a>
</li>
<li>
<a href="PiwikDownloadLogs.java">PiwikDownloadLogs.java</a>
</li>
<li>
<a href="PiwikStatsDB.java">PiwikStatsDB.java</a>
</li>
<li>
<a href="ReadCounterRobotsList.java">ReadCounterRobotsList.java</a>
</li>
<li>
<a href="SarcStats.java">SarcStats.java</a>
</li>
<li>
<a href="UsageStatsExporter.java">UsageStatsExporter.java</a>
</li>
</ul>
<hr noshade>
<em>
Powered by
<a href="http://subversion.tigris.org/">Subversion</a>
version 1.4.4 (r25188).
</em>
</body>
</html>

View File

@@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
</property>
</configuration>

View File

@@ -0,0 +1,76 @@
<workflow-app name="graph_stats" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>stats_db_name</name>
<description>the target stats database name</description>
</property>
<property>
<name>openaire_db_name</name>
<description>the original graph database name</description>
</property>
<property>
<name>external_stats_db_name</name>
<value>stats_ext</value>
<description>the external stats that should be added since they are not included in the graph database</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>hive.metastore.uris</name>
<value>${hiveMetastoreUris}</value>
</property>
</configuration>
</global>
<start to="Step1"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
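<!-- Step1 runs the exporter as a plain java action; the hive2 action below ("StepX") is not reachable from the main path yet. -->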
<action name="Step1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<main-class>eu.dnetlib.usagestats.export.ExecuteWorkflow</main-class>
<arg>${outputFileName}</arg>
<capture-output/>
</java>
<ok to="End" />
<error to="fail" />
</action>
<action name="StepX">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hiveJdbcUrl}</jdbc-url>
<script>scripts/step1.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step2_1"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@@ -26,6 +26,7 @@
<module>dhp-dedup-scholexplorer</module>
<module>dhp-graph-provision-scholexplorer</module>
<module>dhp-stats-update</module>
<module>dhp-usage-stats-update</module>
<module>dhp-broker-events</module>
</modules>