diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTablesSprint1.sql b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTablesSprint1.sql
new file mode 100644
index 000000000..29555b147
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/createIndicatorsTablesSprint1.sql
@@ -0,0 +1,198 @@
+create table TARGET.indi_pub_avg_year_country_oa stored as parquet as
+select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as percentOA,
+round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as percentNonOA
+ from
+ (SELECT year, country, SUM(CASE
+ WHEN bestlicence='Open Access' THEN 1
+ ELSE 0
+ END) AS OpenAccess, SUM(CASE
+ WHEN bestlicence<>'Open Access' THEN 1
+ ELSE 0
+ END) AS NonOpenAccess
+ FROM SOURCE.publication p
+ join SOURCE.result_organization ro on p.id=ro.id
+ join SOURCE.organization o on o.id=ro.organization
+ where cast(year as int)>=2003 and cast(year as int)<=2021
+                group by year, country) tmp;
+
+create table TARGET.indi_dataset_avg_year_country_oa stored as parquet as
+select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as percentOA,
+round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as percentNonOA
+ from
+ (SELECT year, country, SUM(CASE
+ WHEN bestlicence='Open Access' THEN 1
+ ELSE 0
+ END) AS OpenAccess, SUM(CASE
+ WHEN bestlicence<>'Open Access' THEN 1
+ ELSE 0
+ END) AS NonOpenAccess
+ FROM SOURCE.dataset d
+ join SOURCE.result_organization ro on d.id=ro.id
+ join SOURCE.organization o on o.id=ro.organization
+ where cast(year as int)>=2003 and cast(year as int)<=2021
+                group by year, country) tmp;
+
+create table TARGET.indi_software_avg_year_country_oa stored as parquet as
+select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as percentOA,
+round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as percentNonOA
+ from
+ (SELECT year, country, SUM(CASE
+ WHEN bestlicence='Open Access' THEN 1
+ ELSE 0
+ END) AS OpenAccess, SUM(CASE
+ WHEN bestlicence<>'Open Access' THEN 1
+ ELSE 0
+ END) AS NonOpenAccess
+ FROM SOURCE.software s
+ join SOURCE.result_organization ro on s.id=ro.id
+        join SOURCE.organization o on o.id=ro.organization
+ where cast(year as int)>=2003 and cast(year as int)<=2021
+                group by year, country) tmp;
+
+
+create table TARGET.indi_other_avg_year_country_oa stored as parquet as
+select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as percentOA,
+round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as percentNonOA
+ from
+ (SELECT year, country, SUM(CASE
+ WHEN bestlicence='Open Access' THEN 1
+ ELSE 0
+ END) AS OpenAccess, SUM(CASE
+ WHEN bestlicence<>'Open Access' THEN 1
+ ELSE 0
+ END) AS NonOpenAccess
+ FROM SOURCE.otherresearchproduct orp
+ join SOURCE.result_organization ro on orp.id=ro.id
+ join SOURCE.organization o on o.id=ro.organization
+ where cast(year as int)>=2003 and cast(year as int)<=2021
+                group by year, country) tmp;
+
+create table TARGET.indi_pub_avg_year_context_oa stored as parquet as
+with total as
+(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from SOURCE.publication_concepts pc
+join SOURCE.context c on pc.concept like concat('%',c.id,'%')
+join SOURCE.publication p on p.id=pc.id
+where cast(year as int)>=2003 and cast(year as int)<=2021
+group by c.name, year )
+select year, round(no_of_pubs/total*100,3) percentageofpubs, name
+from total;
+
+create table TARGET.indi_dataset_avg_year_context_oa stored as parquet as
+with total as
+(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from SOURCE.dataset_concepts pc
+join SOURCE.context c on pc.concept like concat('%',c.id,'%')
+join SOURCE.dataset p on p.id=pc.id
+where cast(year as int)>=2003 and cast(year as int)<=2021
+group by c.name, year )
+select year, round(no_of_pubs/total*100,3) percentageofdataset, name
+from total;
+
+create table TARGET.indi_software_avg_year_context_oa stored as parquet as
+with total as
+(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from SOURCE.software_concepts pc
+join SOURCE.context c on pc.concept like concat('%',c.id,'%')
+join SOURCE.software p on p.id=pc.id
+where cast(year as int)>=2003 and cast(year as int)<=2021
+group by c.name, year )
+select year, round(no_of_pubs/total*100,3) percentageofsoftware, name
+from total;
+
+create table TARGET.indi_other_avg_year_context_oa stored as parquet as
+with total as
+(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from SOURCE.otherresearchproduct_concepts pc
+join SOURCE.context c on pc.concept like concat('%',c.id,'%')
+join SOURCE.otherresearchproduct p on p.id=pc.id
+where cast(year as int)>=2003 and cast(year as int)<=2021
+group by c.name, year )
+select year, round(no_of_pubs/total*100,3) percentageofother, name
+from total;
+
+create table TARGET.indi_other_avg_year_content_oa stored as parquet as
+with total as
+(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
+from SOURCE.otherresearchproduct_datasources pd
+join SOURCE.datasource d on datasource=d.id
+join SOURCE.otherresearchproduct p on p.id=pd.id
+where cast(year as int)>=2003 and cast(year as int)<=2021
+group by d.type, year)
+select year, round(no_of_pubs/total*100,3) percentageOfOtherresearchproduct, type
+from total;
+
+create table TARGET.indi_software_avg_year_content_oa stored as parquet as
+with total as
+(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
+from SOURCE.software_datasources pd
+join SOURCE.datasource d on datasource=d.id
+join SOURCE.software p on p.id=pd.id
+where cast(year as int)>=2003 and cast(year as int)<=2021
+group by d.type, year)
+select year, round(no_of_pubs/total*100,3) percentageOfSoftware, type
+from total;
+
+create table TARGET.indi_dataset_avg_year_content_oa stored as parquet as
+with total as
+(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
+from SOURCE.dataset_datasources pd
+join SOURCE.datasource d on datasource=d.id
+join SOURCE.dataset p on p.id=pd.id
+where cast(year as int)>=2003 and cast(year as int)<=2021
+group by d.type, year)
+select year, round(no_of_pubs/total*100,3) percentageOfDatasets, type
+from total;
+
+create table TARGET.indi_pub_avg_year_content_oa stored as parquet as
+with total as
+(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
+from SOURCE.publication_datasources pd
+join SOURCE.datasource d on datasource=d.id
+join SOURCE.publication p on p.id=pd.id
+where cast(year as int)>=2003 and cast(year as int)<=2021
+group by d.type, year)
+select year, round(no_of_pubs/total*100,3) percentageOfPubs, type
+from total;
+
+create table TARGET.indi_pub_has_cc_licence_tr stored as parquet as
+select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license_tr
+from SOURCE.publication p
+left outer join (select p.id, license.type as lic from SOURCE.publication p
+join SOURCE.publication_licenses as license on license.id = p.id
+where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
+on p.id= tmp.id;
+
+create table TARGET.indi_pub_has_cc_licence_url stored as parquet as
+select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url
+from SOURCE.publication p
+left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host
+from SOURCE.publication p
+join SOURCE.publication_licenses as license on license.id = p.id
+WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp
+on p.id= tmp.id;
+
+create table TARGET.indi_pub_has_cc_licence_f stored as parquet as
+select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license_f
+from SOURCE.publication p
+left outer join (select p.id, license.type as lic from SOURCE.publication p
+join SOURCE.publication_licenses as license on license.id = p.id
+where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
+on p.id= tmp.id;
+
+create table TARGET.indi_pub_has_abstract stored as parquet as
+select distinct publication.id, coalesce(abstract, 1) has_abstract
+from SOURCE.publication;
+
+compute stats TARGET.indi_pub_avg_year_country_oa;
+compute stats TARGET.indi_dataset_avg_year_country_oa;
+compute stats TARGET.indi_software_avg_year_country_oa;
+compute stats TARGET.indi_other_avg_year_country_oa;
+compute stats TARGET.indi_pub_avg_year_context_oa;
+compute stats TARGET.indi_dataset_avg_year_context_oa;
+compute stats TARGET.indi_software_avg_year_context_oa;
+compute stats TARGET.indi_other_avg_year_context_oa;
+compute stats TARGET.indi_other_avg_year_content_oa;
+compute stats TARGET.indi_software_avg_year_content_oa;
+compute stats TARGET.indi_dataset_avg_year_content_oa;
+compute stats TARGET.indi_pub_avg_year_content_oa;
+compute stats TARGET.indi_pub_has_cc_licence_tr;
+compute stats TARGET.indi_pub_has_cc_licence_url;
+compute stats TARGET.indi_pub_has_cc_licence_f;
+compute stats TARGET.indi_pub_has_abstract;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh
index 306609e8a..b979bf38e 100644
--- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicators.sh
@@ -22,8 +22,8 @@ cat createIndicatorsTables.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala
echo "Indicators Database created"
-echo "Updating Shadow indicators DB"
-impala-shell -q "create database if not exists ${SHADOW}"
-impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
-impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
-echo "Indicators Shadow DB ready!"
\ No newline at end of file
+#echo "Updating Shadow indicators DB"
+#impala-shell -q "create database if not exists ${SHADOW}"
+#impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
+#impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
+#echo "Indicators Shadow DB ready!"
\ No newline at end of file
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicatorsSprint1.sh b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicatorsSprint1.sh
new file mode 100644
index 000000000..fbff4e19d
--- /dev/null
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/scripts/indicatorsSprint1.sh
@@ -0,0 +1,27 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+ rm -Rf "$link_folder"
+ ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+export SOURCE=$1
+export TARGET=$2
+export SHADOW=$3
+export SCRIPT_PATH=$4
+
+echo "Getting file from " $4
+hdfs dfs -copyToLocal $4
+
+echo "Creating indicators tables Sprint1"
+cat createIndicatorsTablesSprint1.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f -
+echo "Indicators Tables for Sprint 1 Created"
+
+
+echo "Updating Shadow indicators DB"
+impala-shell -q "drop database if exists ${SHADOW} cascade"
+impala-shell -q "create database if not exists ${SHADOW}"
+impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
+impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
+echo "Indicators Shadow DB ready!"
\ No newline at end of file
diff --git a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml
index ec917b9a4..89f2e1c7d 100644
--- a/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/indicators/oozie_app/workflow.xml
@@ -90,9 +90,24 @@
${wf:appPath()}/scripts/createIndicatorsTables.sql
scripts/indicators.sh
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ indicatorsSprint1.sh
+ ${stats_db_name}
+ ${indicators_db_name}
+ ${indicators_shadow_db_name}
+ ${wf:appPath()}/scripts/createIndicatorsTablesSprint1.sql
+ scripts/indicatorsSprint1.sh
+
+
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
diff --git a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
index e89e2e5a4..dee0988cf 100644
--- a/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
+++ b/dhp-workflows/dhp-usage-datasets-stats-update/src/main/java/eu/dnetlib/oa/graph/datasetsusagestats/export/ReadReportsListFromDatacite.java
@@ -381,7 +381,7 @@ public class ReadReportsListFromDatacite {
logger.info("Created view datacite_downloads on permanent datasets usagestats DB");
stmt.close();
- ConnectDB.getHiveConnection().close();
+ // ConnectDB.getHiveConnection().close();
logger.info("Completed Building Permanent Datasets Usage Stats DB");
}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/pom.xml b/dhp-workflows/dhp-usage-raw-data-update-beta/pom.xml
new file mode 100644
index 000000000..7590897f6
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/pom.xml
@@ -0,0 +1,91 @@
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.2.4-SNAPSHOT
+
+ 4.0.0
+ dhp-usage-raw-data-update-beta
+
+
+
+ pl.project13.maven
+ git-commit-id-plugin
+ 2.1.15
+
+
+
+ revision
+
+
+
+
+ ${project.basedir}/../.git
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.6.1
+
+
+ 1.8
+
+
+
+
+
+ UTF-8
+ UTF-8
+ 0.13.1-cdh5.2.1
+ 2.5.0-cdh5.2.1
+
+
+
+
+ org.apache.spark
+ spark-core_2.11
+ 2.2.0
+
+
+ org.apache.spark
+ spark-sql_2.11
+ 2.4.5
+
+
+ com.googlecode.json-simple
+ json-simple
+ 1.1.1
+
+
+ org.json
+ json
+ 20180130
+ jar
+
+
+ org.apache.hive
+ hive-jdbc
+ ${cdh.hive.version}
+
+
+ org.apache.hadoop
+ hadoop-common
+ ${cdh.hadoop.version}
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ c3p0
+ c3p0
+ 0.9.1.2
+ jar
+
+
+ dhp-usage-raw-data-update-beta
+
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/runworkflow.sh b/dhp-workflows/dhp-usage-raw-data-update-beta/runworkflow.sh
new file mode 100755
index 000000000..64e6ea51a
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/runworkflow.sh
@@ -0,0 +1 @@
+mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagerawdatabeta
\ No newline at end of file
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ConnectDB.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ConnectDB.java
new file mode 100644
index 000000000..5b5d82b87
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ConnectDB.java
@@ -0,0 +1,125 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Properties;
+
+import org.apache.log4j.Logger;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+import com.mchange.v2.c3p0.ComboPooledDataSource;
+
+public abstract class ConnectDB {
+
+ public static Connection DB_HIVE_CONNECTION;
+ public static Connection DB_IMPALA_CONNECTION;
+
+ private static String dbHiveUrl;
+ private static String dbImpalaUrl;
+ private static String usageStatsDBSchema;
+ private static String statsDBSchema;
+ private final static Logger log = Logger.getLogger(ConnectDB.class);
+
+ static void init() throws ClassNotFoundException {
+
+ dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
+ dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
+ usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema;
+ statsDBSchema = ExecuteWorkflow.statsDBSchema;
+
+ Class.forName("org.apache.hive.jdbc.HiveDriver");
+ }
+
+ public static Connection getHiveConnection() throws SQLException {
+ if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) {
+ return DB_HIVE_CONNECTION;
+ } else {
+ DB_HIVE_CONNECTION = connectHive();
+
+ return DB_HIVE_CONNECTION;
+ }
+ }
+
+ public static Connection getImpalaConnection() throws SQLException {
+ if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) {
+ return DB_IMPALA_CONNECTION;
+ } else {
+ DB_IMPALA_CONNECTION = connectImpala();
+
+ return DB_IMPALA_CONNECTION;
+ }
+ }
+
+ public static String getUsageStatsDBSchema() {
+ return ConnectDB.usageStatsDBSchema;
+ }
+
+ public static String getStatsDBSchema() {
+ return ConnectDB.statsDBSchema;
+ }
+
+ private static Connection connectHive() throws SQLException {
+ /*
+ * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt =
+ * connection.createStatement(); log.debug("Opened database successfully"); return connection;
+ */
+ ComboPooledDataSource cpds = new ComboPooledDataSource();
+ cpds.setJdbcUrl(dbHiveUrl);
+ cpds.setAcquireIncrement(1);
+ cpds.setMaxPoolSize(100);
+ cpds.setMinPoolSize(1);
+ cpds.setInitialPoolSize(1);
+ cpds.setMaxIdleTime(300);
+ cpds.setMaxConnectionAge(36000);
+
+ cpds.setAcquireRetryAttempts(5);
+ cpds.setAcquireRetryDelay(2000);
+ cpds.setBreakAfterAcquireFailure(false);
+
+ cpds.setCheckoutTimeout(0);
+ cpds.setPreferredTestQuery("SELECT 1");
+ cpds.setIdleConnectionTestPeriod(60);
+ return cpds.getConnection();
+
+ }
+
+ private static Connection connectImpala() throws SQLException {
+ /*
+ * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt =
+ * connection.createStatement(); log.debug("Opened database successfully"); return connection;
+ */
+ ComboPooledDataSource cpds = new ComboPooledDataSource();
+ cpds.setJdbcUrl(dbImpalaUrl);
+ cpds.setAcquireIncrement(1);
+ cpds.setMaxPoolSize(100);
+ cpds.setMinPoolSize(1);
+ cpds.setInitialPoolSize(1);
+ cpds.setMaxIdleTime(300);
+ cpds.setMaxConnectionAge(36000);
+
+ cpds.setAcquireRetryAttempts(5);
+ cpds.setAcquireRetryDelay(2000);
+ cpds.setBreakAfterAcquireFailure(false);
+
+ cpds.setCheckoutTimeout(0);
+ cpds.setPreferredTestQuery("SELECT 1");
+ cpds.setIdleConnectionTestPeriod(60);
+
+ return cpds.getConnection();
+
+ }
+
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ExecuteWorkflow.java
new file mode 100644
index 000000000..42324ec6e
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ExecuteWorkflow.java
@@ -0,0 +1,215 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+import java.util.Date;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.BasicConfigurator;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class ExecuteWorkflow {
+
+ static String matomoAuthToken;
+ static String matomoBaseURL;
+ static String repoLogPath;
+ static String portalLogPath;
+ static String portalMatomoID;
+ static String irusUKBaseURL;
+ static String irusUKReportPath;
+ static String sarcsReportPathArray;
+ static String sarcsReportPathNonArray;
+ static String lareferenciaLogPath;
+ static String lareferenciaBaseURL;
+ static String lareferenciaAuthToken;
+ static String dbHiveUrl;
+ static String dbImpalaUrl;
+ static String usageStatsDBSchema;
+ static String statsDBSchema;
+ static boolean recreateDbAndTables;
+
+ static boolean piwikEmptyDirs;
+ static boolean downloadPiwikLogs;
+ static boolean processPiwikLogs;
+
+ static Calendar startingLogPeriod;
+ static Calendar endingLogPeriod;
+ static int numberOfPiwikIdsToDownload;
+ static int numberOfSiteIdsToDownload;
+
+ static boolean laReferenciaEmptyDirs;
+ static boolean downloadLaReferenciaLogs;
+ static boolean processLaReferenciaLogs;
+
+ static boolean irusCreateTablesEmptyDirs;
+ static boolean irusDownloadReports;
+ static boolean irusProcessStats;
+ static int irusNumberOfOpendoarsToDownload;
+
+ static boolean sarcCreateTablesEmptyDirs;
+ static boolean sarcDownloadReports;
+ static boolean sarcProcessStats;
+ static int sarcNumberOfIssnToDownload;
+
+ static boolean finalizeStats;
+
+ static int numberOfDownloadThreads;
+
+ static int b2SSHAREID;
+
+ public static void main(String args[]) throws Exception {
+
+ // Sending the logs to the console
+ BasicConfigurator.configure();
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils
+ .toString(
+ UsageStatsExporter.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/export/usagerawdata_parameters.json")));
+ parser.parseArgument(args);
+
+ // Setting up the initial parameters
+ matomoAuthToken = parser.get("matomoAuthToken");
+ matomoBaseURL = parser.get("matomoBaseURL");
+ repoLogPath = parser.get("repoLogPath");
+ portalLogPath = parser.get("portalLogPath");
+ portalMatomoID = parser.get("portalMatomoID");
+ irusUKBaseURL = parser.get("irusUKBaseURL");
+ irusUKReportPath = parser.get("irusUKReportPath");
+ sarcsReportPathArray = parser.get("sarcsReportPathArray");
+ sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
+ lareferenciaLogPath = parser.get("lareferenciaLogPath");
+ lareferenciaBaseURL = parser.get("lareferenciaBaseURL");
+ lareferenciaAuthToken = parser.get("lareferenciaAuthToken");
+
+ dbHiveUrl = parser.get("dbHiveUrl");
+ dbImpalaUrl = parser.get("dbImpalaUrl");
+ usageStatsDBSchema = parser.get("usageStatsDBSchema");
+ statsDBSchema = parser.get("statsDBSchema");
+
+ if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) {
+ recreateDbAndTables = true;
+ } else {
+ recreateDbAndTables = false;
+ }
+
+ if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) {
+ piwikEmptyDirs = true;
+ } else {
+ piwikEmptyDirs = false;
+ }
+
+ if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) {
+ downloadPiwikLogs = true;
+ } else {
+ downloadPiwikLogs = false;
+ }
+
+ if (parser.get("processPiwikLogs").toLowerCase().equals("true")) {
+ processPiwikLogs = true;
+ } else {
+ processPiwikLogs = false;
+ }
+
+ String startingLogPeriodStr = parser.get("startingLogPeriod");
+ Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr);
+ startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate);
+
+// String endingLogPeriodStr = parser.get("endingLogPeriod");
+// Date endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr);
+// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate);
+
+ numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload"));
+ numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload"));
+
+ if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) {
+ laReferenciaEmptyDirs = true;
+ } else {
+ laReferenciaEmptyDirs = false;
+ }
+
+ if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) {
+ downloadLaReferenciaLogs = true;
+ } else {
+ downloadLaReferenciaLogs = false;
+ }
+
+ if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) {
+ processLaReferenciaLogs = true;
+ } else {
+ processLaReferenciaLogs = false;
+ }
+
+ if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) {
+ irusCreateTablesEmptyDirs = true;
+ } else {
+ irusCreateTablesEmptyDirs = false;
+ }
+
+ if (parser.get("irusDownloadReports").toLowerCase().equals("true")) {
+ irusDownloadReports = true;
+ } else {
+ irusDownloadReports = false;
+ }
+
+ if (parser.get("irusProcessStats").toLowerCase().equals("true")) {
+ irusProcessStats = true;
+ } else {
+ irusProcessStats = false;
+ }
+ irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload"));
+
+ if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) {
+ sarcCreateTablesEmptyDirs = true;
+ } else {
+ sarcCreateTablesEmptyDirs = false;
+ }
+
+ if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) {
+ sarcDownloadReports = true;
+ } else {
+ sarcDownloadReports = false;
+ }
+
+ if (parser.get("sarcProcessStats").toLowerCase().equals("true")) {
+ sarcProcessStats = true;
+ } else {
+ sarcProcessStats = false;
+ }
+ sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload"));
+
+ if (parser.get("finalizeStats").toLowerCase().equals("true")) {
+ finalizeStats = true;
+ } else {
+ finalizeStats = false;
+ }
+
+ numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads"));
+
+ b2SSHAREID = Integer.parseInt(parser.get("b2shareID"));
+
+ UsageStatsExporter usagestatsExport = new UsageStatsExporter();
+ usagestatsExport.export();
+ // usagestatsExport.createdDBWithTablesOnly();
+ }
+
+ private static Calendar startingLogPeriodStr(Date date) {
+
+ Calendar calendar = Calendar.getInstance();
+ calendar.setTime(date);
+ return calendar;
+
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/IrusStats.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/IrusStats.java
new file mode 100644
index 000000000..f369602be
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/IrusStats.java
@@ -0,0 +1,358 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.*;
+import java.net.URL;
+import java.net.URLConnection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class IrusStats {
+
+ private String irusUKURL;
+
+ private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);
+
+ public IrusStats(String irusUKURL) throws Exception {
+ this.irusUKURL = irusUKURL;
+ // The following may not be needed - It will be created when JSON tables are created
+// createTmpTables();
+ }
+
+ public void reCreateLogDirs() throws Exception {
+ FileSystem dfs = FileSystem.get(new Configuration());
+
+ logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
+ dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true);
+
+ logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath));
+ }
+
+ public void createTables() throws Exception {
+ try {
+ logger.info("Creating sushilog");
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog(source STRING, "
+ + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
+ + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableSushiLog);
+ logger.info("Created sushilog");
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Sushi Tables Created");
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
+
+ public void processIrusStats() throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Adding JSON Serde jar");
+ stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+ logger.info("Added JSON Serde jar");
+
+ logger.info("Dropping sushilogtmp_json table");
+ String dropSushilogtmpJson = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilogtmp_json";
+ stmt.executeUpdate(dropSushilogtmpJson);
+ logger.info("Dropped sushilogtmp_json table");
+
+ logger.info("Creating irus_sushilogtmp_json table");
+ String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n"
+ + " `ItemIdentifier` ARRAY<\n"
+ + " struct<\n"
+ + " Type: STRING,\n"
+ + " Value: STRING\n"
+ + " >\n"
+ + " >,\n"
+ + " `ItemPerformance` ARRAY<\n"
+ + " struct<\n"
+ + " `Period`: struct<\n"
+ + " `Begin`: STRING,\n"
+ + " `End`: STRING\n"
+ + " >,\n"
+ + " `Instance`: struct<\n"
+ + " `Count`: STRING,\n"
+ + " `MetricType`: STRING\n"
+ + " >\n"
+ + " >\n"
+ + " >\n"
+ + ")\n"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(createSushilogtmpJson);
+ logger.info("Created irus_sushilogtmp_json table");
+
+ logger.info("Dropping irus_sushilogtmp table");
+ String dropSushilogtmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".irus_sushilogtmp";
+ stmt.executeUpdate(dropSushilogtmp);
+ logger.info("Dropped irus_sushilogtmp table");
+
+ logger.info("Creating irus_sushilogtmp table");
+ String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema()
+ + ".irus_sushilogtmp(source STRING, repository STRING, "
+ + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
+ + "tblproperties('transactional'='true')";
+ stmt.executeUpdate(createSushilogtmp);
+ logger.info("Created irus_sushilogtmp table");
+
+ logger.info("Inserting to irus_sushilogtmp table");
+ String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp "
+ + "SELECT 'IRUS-UK', CONCAT('opendoar____::', split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), "
+ + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, "
+ + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json "
+ + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
+ + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf "
+ + "WHERE `ItemIdent`.`Type`= 'OAI'";
+ stmt.executeUpdate(insertSushilogtmp);
+ logger.info("Inserted to irus_sushilogtmp table");
+
+ logger.info("Inserting to sushilog table");
+ String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".irus_sushilogtmp";
+ stmt.executeUpdate(insertToShushilog);
+ logger.info("Inserted to sushilog table");
+
+ ConnectDB.getHiveConnection().close();
+ }
+
+ public void getIrusRRReport(String irusUKReportPath) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM");
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime()));
+
+ // Setting the ending period (last day of the month)
+// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+// end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
+
+ logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime()));
+
+ String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime())
+ + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback=";
+
+ logger.info("(getIrusRRReport) Getting report: " + reportUrl);
+
+ String text = getJson(reportUrl, "", "");
+
+ List opendoarsToVisit = new ArrayList();
+ JSONParser parser = new JSONParser();
+ JSONObject jsonObject = (JSONObject) parser.parse(text);
+ jsonObject = (JSONObject) jsonObject.get("ReportResponse");
+ jsonObject = (JSONObject) jsonObject.get("Report");
+ jsonObject = (JSONObject) jsonObject.get("Report");
+ jsonObject = (JSONObject) jsonObject.get("Customer");
+ JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
+ if (jsonArray != null) {
+ int i = 0;
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier");
+ for (Object identifier : itemIdentifier) {
+ JSONObject opendoar = (JSONObject) identifier;
+ if (opendoar.get("Type").toString().equals("OpenDOAR")) {
+ i++;
+ opendoarsToVisit.add(opendoar.get("Value").toString());
+ break;
+ }
+ }
+ // break;
+ }
+
+ logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit);
+
+ if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0
+ && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) {
+ logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
+ opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload);
+ }
+
+ logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit);
+
+ for (String opendoar : opendoarsToVisit) {
+ logger.info("Now working on openDoar: " + opendoar);
+ this.getIrusIRReport(opendoar, irusUKReportPath);
+ }
+ logger.info("(getIrusRRReport) Finished with report: " + reportUrl);
+ } else {
+ logger.info("IRUS Reports not found for day");
+ }
+
+ }
+
+ private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception {
+
+ logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar);
+
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM");
+
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
+
+ // Setting the ending period (last day of the month)
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
+
+// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+// end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ PreparedStatement st = ConnectDB
+ .getHiveConnection()
+ .prepareStatement(
+ "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
+ st.setString(1, "opendoar____::" + opendoar);
+ ResultSet rs_date = st.executeQuery();
+ Date dateMax = null;
+ while (rs_date.next()) {
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
+ int batch_size = 0;
+
+ if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar);
+ } else {
+ start.add(Calendar.MONTH, 1);
+ while (start.before(end)) {
+ logger.info("Downloading for date: " + simpleDateFormat.format(start.getTime()));
+ String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate="
+ + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime())
+ + "&RepositoryIdentifier=opendoar%3A" + opendoar
+ + "&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback=";
+ start.add(Calendar.MONTH, 1);
+
+ logger.info("Downloading file: " + reportUrl);
+ String text = getJson(reportUrl, "", "");
+ if (text == null) {
+ continue;
+ }
+
+ FileSystem fs = FileSystem.get(new Configuration());
+ String filePath = irusUKReportPath + "/" + "IrusIRReport_"
+ + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json";
+ logger.info("Storing to file: " + filePath);
+ FSDataOutputStream fin = fs.create(new Path(filePath), true);
+
+ JSONParser parser = new JSONParser();
+ JSONObject jsonObject = (JSONObject) parser.parse(text);
+ jsonObject = (JSONObject) jsonObject.get("ReportResponse");
+ jsonObject = (JSONObject) jsonObject.get("Report");
+ jsonObject = (JSONObject) jsonObject.get("Report");
+ jsonObject = (JSONObject) jsonObject.get("Customer");
+ JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems");
+ if (jsonArray == null) {
+ continue;
+ }
+ String oai = "";
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ fin.write(jsonObjectRow.toJSONString().getBytes());
+ fin.writeChar('\n');
+ }
+
+ fin.close();
+ }
+
+ }
+ // ConnectDB.getHiveConnection().close();
+
+ logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar);
+ }
+
+ private String getJson(String url) throws Exception {
+ try {
+ System.out.println("===> Connecting to: " + url);
+ URL website = new URL(url);
+ System.out.println("Connection url -----> " + url);
+ URLConnection connection = website.openConnection();
+
+ // connection.setRequestProperty ("Authorization", "Basic "+encoded);
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+// response.append("\n");
+ }
+ }
+
+ System.out.println("response ====> " + response.toString());
+
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL: " + e);
+ System.out.println("Failed to get URL: " + e);
+ throw new Exception("Failed to get URL: " + e.toString(), e);
+ }
+ }
+
+ private String getJson(String url, String username, String password) throws Exception {
+ // String cred=username+":"+password;
+ // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
+ try {
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
+ // connection.setRequestProperty ("Authorization", "Basic "+encoded);
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+ response.append("\n");
+ }
+ }
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL", e);
+ return null;
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/LaReferenciaDownloadLogs.java
new file mode 100644
index 000000000..e64afae89
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/LaReferenciaDownloadLogs.java
@@ -0,0 +1,273 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.*;
+import java.net.URL;
+import java.net.URLConnection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class LaReferenciaDownloadLogs {
+
+ private final String piwikUrl;
+ private Date startDate;
+ private final String tokenAuth;
+
+ /*
+ * The Piwik's API method
+ */
+ private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+ private final String format = "&format=json";
+ private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess";
+
+ private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class);
+
+ public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception {
+ this.piwikUrl = piwikUrl;
+ this.tokenAuth = tokenAuth;
+ this.createTables();
+// this.createTmpTables();
+ }
+
+ public void reCreateLogDirs() throws IllegalArgumentException, IOException {
+ FileSystem dfs = FileSystem.get(new Configuration());
+
+ logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+ dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
+
+ logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
+ }
+
+ private void createTables() throws Exception {
+ try {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Creating LaReferencia tables");
+ String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
+ + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ + "stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableLareferenciaLog);
+ logger.info("Created LaReferencia tables");
+// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+// + " ON INSERT TO lareferencialog "
+// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit,"
+// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id "
+// + "FROM lareferencialog "
+// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
+// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");";
+// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog);
+// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog);
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Lareferencia Tables Created");
+
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ // System.exit(0);
+ }
+ }
+
+// private void createTmpTables() throws Exception {
+//
+// try {
+// Statement stmt = ConnectDB.getConnection().createStatement();
+// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
+// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+// + " ON INSERT TO lareferencialogtmp "
+// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit,"
+// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id "
+// + "FROM lareferencialogtmp "
+// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
+// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog);
+// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog);
+//
+// stmt.close();
+// log.info("Lareferencia Tmp Tables Created");
+//
+// } catch (Exception e) {
+// log.error("Failed to create tmptables: " + e);
+// throw new Exception("Failed to create tmp tables: " + e.toString(), e);
+// // System.exit(0);
+// }
+// }
+ private String getPiwikLogUrl() {
+ return piwikUrl + "/";
+ }
+
+ private String getJson(String url) throws Exception {
+ try {
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
+
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+// response.append("\n");
+ }
+ }
+
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL: " + e);
+ throw new Exception("Failed to get URL: " + e.toString(), e);
+ }
+ }
+
+ public void GetLaReferenciaRepos(String repoLogsPath) throws Exception {
+
+ String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth;
+ String content = "";
+
+ List siteIdsToVisit = new ArrayList();
+
+ // Getting all the siteIds in a list for logging reasons & limiting the list
+ // to the max number of siteIds
+ content = getJson(baseApiUrl);
+ JSONParser parser = new JSONParser();
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString()));
+ }
+ logger.info("Found the following siteIds for download: " + siteIdsToVisit);
+
+ if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+ && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) {
+ logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ }
+
+ logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit);
+
+ for (int siteId : siteIdsToVisit) {
+ logger.info("Now working on LaReferencia MatomoId: " + siteId);
+ this.GetLaReFerenciaLogs(repoLogsPath, siteId);
+ }
+ }
+
+ public void GetLaReFerenciaLogs(String repoLogsPath,
+ int laReferencialMatomoID) throws Exception {
+
+ logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID);
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+
+ // Setting the ending period (last day of the month)
+// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+// end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
+
+ logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+
+ PreparedStatement st = ConnectDB
+ .getHiveConnection()
+ .prepareStatement(
+ "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ + ".lareferencialog WHERE matomoid=?");
+ st.setInt(1, laReferencialMatomoID);
+ Date dateMax = null;
+
+ ResultSet rs_date = st.executeQuery();
+ while (rs_date.next()) {
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
+
+ for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+ Date date = currDay.getTime();
+ if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+ logger
+ .info(
+ "Date found in logs " + dateMax + " and not downloanding Matomo logs for "
+ + laReferencialMatomoID);
+ } else {
+ logger
+ .info(
+ "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for "
+ + sdf.format(date));
+
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = "";
+ outFolder = repoLogsPath;
+
+ FileSystem fs = FileSystem.get(new Configuration());
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(
+ outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"),
+ true);
+
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
+ int i = 0;
+
+ JSONParser parser = new JSONParser();
+ do {
+ String apiUrl = baseApiUrl;
+
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
+
+ content = getJson(apiUrl);
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
+
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ fin.write(jsonObjectRaw.toJSONString().getBytes());
+ fin.writeChar('\n');
+ }
+
+ logger
+ .info(
+ "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID
+ + " and for "
+ + sdf.format(date));
+ i++;
+ } while (true);
+ fin.close();
+ }
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/LaReferenciaStats.java
new file mode 100644
index 000000000..3c134f80e
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/LaReferenciaStats.java
@@ -0,0 +1,291 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.*;
+import java.net.URLDecoder;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Timestamp;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
/**
 * Processes the LaReferencia Matomo logs previously downloaded to HDFS: loads the JSON files
 * into Hive through a JSON-serde staging table, removes download double clicks and updates
 * the permanent lareferencialog table.
 *
 * NOTE(review): this copy of the class is corrupted — several lines were apparently lost
 * wherever a '<' character occurred (see removeDoubleClicks below). Restore from the
 * original source before compiling.
 *
 * @author D. Pierrakos, S. Zoupanos
 */
public class LaReferenciaStats {

	private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class);

	// HDFS directory holding the downloaded LaReferencia JSON log files
	private String logRepoPath;

	private Statement stmt = null;

	private String CounterRobotsURL;
	// NOTE(review): declared raw — presumably ArrayList<String>; the type argument looks
	// stripped in transit. TODO confirm against the original source.
	private ArrayList robotsList;

	public LaReferenciaStats(String logRepoPath) throws Exception {
		this.logRepoPath = logRepoPath;
		this.createTables();
//		this.createTmpTables();
	}

	/*
	 * private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) {
	 * log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } }
	 */
	/**
	 * Creates the permanent lareferencialog table in the usage-stats Hive schema if missing.
	 *
	 * @throws Exception when the DDL statement fails
	 */
	private void createTables() throws Exception {
		try {
			Statement stmt = ConnectDB.getHiveConnection().createStatement();

			logger.info("Creating LaReferencia tables");
			String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " +
				ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " +
				"source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " +
				"source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
				"clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " +
				"stored as orc tblproperties('transactional'='true')";
			stmt.executeUpdate(sqlCreateTableLareferenciaLog);
			logger.info("Created LaReferencia tables");

			stmt.close();
			ConnectDB.getHiveConnection().close();
			logger.info("Lareferencia Tables Created");

		} catch (Exception e) {
			logger.error("Failed to create tables: " + e);
			throw new Exception("Failed to create tables: " + e.toString(), e);
			// System.exit(0);
		}
	}

	/**
	 * Runs the full LaReferencia processing pipeline: load logs into Hive, remove double
	 * clicks, update the production tables.
	 *
	 * NOTE(review): updateProdTables() is called here but its definition is not present in
	 * this (corrupted) copy — it is presumably among the lines lost below. TODO confirm.
	 *
	 * @throws Exception when any processing step fails
	 */
	public void processLogs() throws Exception {
		try {
			logger.info("Processing LaReferencia repository logs");
			processlaReferenciaLog();
			logger.info("LaReferencia repository logs process done");

			logger.info("LaReferencia removing double clicks");
			removeDoubleClicks();
			logger.info("LaReferencia removed double clicks");

			logger.info("LaReferencia updating Production Tables");
			updateProdTables();
			logger.info("LaReferencia updated Production Tables");

		} catch (Exception e) {
			logger.error("Failed to process logs: " + e);
			throw new Exception("Failed to process logs: " + e.toString(), e);
		}
	}

	/**
	 * Loads the downloaded JSON logs into Hive: exposes them through an external JSON-serde
	 * table and explodes each visit's action details into the lareferencialogtmp staging table.
	 */
	public void processlaReferenciaLog() throws Exception {
		Statement stmt = ConnectDB.getHiveConnection().createStatement();
		ConnectDB.getHiveConnection().setAutoCommit(false);

		logger.info("Adding JSON Serde jar");
		stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
		logger.info("Added JSON Serde jar");

		logger.info("Dropping lareferencialogtmp_json table");
		String drop_lareferencialogtmp_json = "DROP TABLE IF EXISTS " +
			ConnectDB.getUsageStatsDBSchema() +
			".lareferencialogtmp_json";
		stmt.executeUpdate(drop_lareferencialogtmp_json);
		logger.info("Dropped lareferencialogtmp_json table");

		logger.info("Creating lareferencialogtmp_json");
		String create_lareferencialogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " +
			ConnectDB.getUsageStatsDBSchema() +
			".lareferencialogtmp_json(\n" +
			"	`idSite` STRING,\n" +
			"	`idVisit` STRING,\n" +
			"	`country` STRING,\n" +
			"	`referrerName` STRING,\n" +
			"	`browser` STRING,\n" +
			"	`repItem` STRING,\n" +
			"	`actionDetails` ARRAY<\n" +
			"						struct<\n" +
			"							timestamp: STRING,\n" +
			"							type: STRING,\n" +
			"							url: STRING,\n" +
			"							`customVariables`: struct<\n" +
			"								`1`: struct<\n" +
			"									`customVariablePageValue1`: STRING\n" +
			"										>,\n" +
			"								`2`: struct<\n" +
			"									`customVariablePageValue2`: STRING\n" +
			"										>\n" +
			"								>\n" +
			"							>\n" +
			"						>" +
			")\n" +
			"ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" +
			"LOCATION '" + ExecuteWorkflow.lareferenciaLogPath + "'\n" +
			"TBLPROPERTIES (\"transactional\"=\"false\")";
		stmt.executeUpdate(create_lareferencialogtmp_json);
		logger.info("Created lareferencialogtmp_json");

		logger.info("Dropping lareferencialogtmp table");
		String drop_lareferencialogtmp = "DROP TABLE IF EXISTS " +
			ConnectDB.getUsageStatsDBSchema() +
			".lareferencialogtmp";
		stmt.executeUpdate(drop_lareferencialogtmp);
		logger.info("Dropped lareferencialogtmp table");

		logger.info("Creating lareferencialogtmp");
		String create_lareferencialogtmp = "CREATE TABLE " +
			ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp(matomoid INT, " +
			"source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " +
			"source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " +
			"clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " +
			"stored as orc tblproperties('transactional'='true')";
		stmt.executeUpdate(create_lareferencialogtmp);
		logger.info("Created lareferencialogtmp");

		logger.info("Inserting into lareferencialogtmp");
		// Each row of the staging table is one visit action; the OpenDOAR id and entity id
		// come from Matomo custom page variables 2 and 1 respectively
		String insert_lareferencialogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp " +
			"SELECT DISTINCT cast(idSite as INT) as matomoid, CONCAT('opendoar____::', " +
			"actiondetail.customVariables.`2`.customVariablePageValue2) as source, idVisit as id_Visit, country, " +
			"actiondetail.type as action, actiondetail.url as url, " +
			"actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " +
			"'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " +
			"referrerName as referrer_name, browser as agent " +
			"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json " +
			"LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
		stmt.executeUpdate(insert_lareferencialogtmp);
		logger.info("Inserted into lareferencialogtmp");

		stmt.close();
	}

	/**
	 * Removes duplicate download clicks from the staging table.
	 *
	 * NOTE(review): the statement below is corrupted in this copy of the file — the tail of
	 * the double-click DELETE (starting at "p1.timestamp<p2.timestamp"), the remainder of
	 * this method and any methods up to listHdfsDir (including, apparently, updateProdTables)
	 * were lost where '<' characters occurred, fusing this SQL string into listHdfsDir's
	 * signature (whose return type, presumably ArrayList<String>, is also missing).
	 * Restore from the original source before use.
	 */
	public void removeDoubleClicks() throws Exception {

		Statement stmt = ConnectDB.getHiveConnection().createStatement();
		ConnectDB.getHiveConnection().setAutoCommit(false);

		logger.info("Cleaning download double clicks");
		// clean download double clicks
		String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" +
			"SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " +
			"FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " +
			ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " +
			"WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " +
			"AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " +
			"AND p1.timestamp listHdfsDir(String dir) throws Exception {
		FileSystem hdfs = FileSystem.get(new Configuration());
		RemoteIterator Files;
		ArrayList fileNames = new ArrayList<>();

		try {
			Path exportPath = new Path(hdfs.getUri() + dir);
			Files = hdfs.listFiles(exportPath, false);
			while (Files.hasNext()) {
				String fileName = Files.next().getPath().toString();
				// log.info("Found hdfs file " + fileName);
				fileNames.add(fileName);
			}
			// hdfs.close();
		} catch (Exception e) {
			logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logRepoPath));
			throw new Exception("HDFS file path with exported data does not exist :   " + logRepoPath, e);
		}

		return fileNames;
	}

	/**
	 * Reads an HDFS file containing concatenated Matomo JSON pages and merges them into a
	 * single JSON array string ("[]" when empty).
	 *
	 * @param filename the HDFS path to read
	 * @throws Exception when the file cannot be read
	 */
	private String readHDFSFile(String filename) throws Exception {
		String result;
		try {

			FileSystem fs = FileSystem.get(new Configuration());
			// log.info("reading file : " + filename);

			BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));

			StringBuilder sb = new StringBuilder();
			String line = br.readLine();

			while (line != null) {
				if (!line.equals("[]")) {
					sb.append(line);
				}
				// sb.append(line);
				line = br.readLine();
			}
			// Stitch adjacent JSON pages ("][{"...) back into one array
			result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
			if (result.equals("")) {
				result = "[]";
			}

			// fs.close();
		} catch (Exception e) {
			logger.error(e.getMessage());
			throw new Exception(e);
		}

		return result;
	}

}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikDownloadLogs.java
new file mode 100644
index 000000000..37ad2149c
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikDownloadLogs.java
@@ -0,0 +1,332 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.*;
+import java.net.Authenticator;
+import java.net.URL;
+import java.net.URLConnection;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class PiwikDownloadLogs {
+
+ private final String piwikUrl;
+ private Date startDate;
+ private final String tokenAuth;
+
+ /*
+ * The Piwik's API method
+ */
+ private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+ private final String format = "&format=json";
+
+ private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class);
+
+ public PiwikDownloadLogs(String piwikUrl, String tokenAuth) {
+ this.piwikUrl = piwikUrl;
+ this.tokenAuth = tokenAuth;
+
+ }
+
+ private String getPiwikLogUrl() {
+ return "https://" + piwikUrl + "/";
+ }
+
+ private String getJson(String url) throws Exception {
+ try {
+ logger.debug("Connecting to download the JSON: " + url);
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
+
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+ }
+ }
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL: " + url + " Exception: " + e);
+ throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
+ }
+ }
+
+ class WorkerThread implements Runnable {
+
+ private Calendar currDay;
+ private int siteId;
+ private String repoLogsPath;
+ private String portalLogPath;
+ private String portalMatomoID;
+
+ public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws IOException {
+ this.currDay = (Calendar) currDay.clone();
+ this.siteId = new Integer(siteId);
+ this.repoLogsPath = new String(repoLogsPath);
+ this.portalLogPath = new String(portalLogPath);
+ this.portalMatomoID = new String(portalMatomoID);
+ }
+
+ public void run() {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Start) Thread for "
+ + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+ + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+ + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
+ try {
+ GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (End) Thread for "
+ + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId
+ + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath
+ + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID);
+ }
+
+ public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+ Date date = currDay.getTime();
+ logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
+
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = "";
+ if (siteId == Integer.parseInt(portalMatomoID)) {
+ outFolder = portalLogPath;
+ } else {
+ outFolder = repoLogsPath;
+ }
+
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
+
+ int i = 0;
+
+ JSONParser parser = new JSONParser();
+ StringBuffer totalContent = new StringBuffer();
+ FileSystem fs = FileSystem.get(new Configuration());
+
+ do {
+ int writtenBytes = 0;
+ String apiUrl = baseApiUrl;
+
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
+
+ content = getJson(apiUrl);
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
+
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json"),
+ true);
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
+
+ writtenBytes += jsonObjectRawBytes.length + 1;
+ }
+
+ fin.close();
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json");
+
+ i++;
+ } while (true);
+
+ fs.close();
+ }
+ }
+
+ public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception {
+
+ Statement statement = ConnectDB.getHiveConnection().createStatement();
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+ ResultSet rs = statement
+ .executeQuery(
+ "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema()
+ + ".datasource where piwik_id is not null and piwik_id <> 0 and piwik_id <> 196 order by piwik_id");
+
+ // Getting all the piwikids in a list for logging reasons & limitting the list
+ // to the max number of piwikids
+ List piwikIdToVisit = new ArrayList();
+// while (rs.next()) {
+// piwikIdToVisit.add(rs.getInt(1));
+// }
+ piwikIdToVisit.add(231);
+ logger.info("Found the following piwikIds for download: " + piwikIdToVisit);
+
+ if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+ && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
+ logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ }
+
+ logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit);
+
+ // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads);
+ for (int siteId : piwikIdToVisit) {
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+
+ // Setting the ending period (last day of the month)
+ // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
+ // end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+
+ logger.info("Now working on piwikId: " + siteId);
+
+ PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
+ .prepareStatement(
+ "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklog WHERE source=?");
+ st.setInt(1, siteId);
+ Date dateMax = null;
+ ResultSet rs_date = st.executeQuery();
+ while (rs_date.next()) {
+ logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
+
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
+
+ for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+ // logger.info("Date used " + currDay.toString());
+ // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ // executor.execute(worker);// calling execute method of ExecutorService
+ logger.info("Date used " + currDay.getTime().toString());
+
+ if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId);
+ } else {
+ GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID);
+ }
+
+ }
+ }
+ // executor.shutdown();
+ // while (!executor.isTerminated()) {
+ // }
+ // System.out.println("Finished all threads");
+ }
+
+ public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath,
+ String portalMatomoID) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+ Date date = currDay.getTime();
+ logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
+
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = "";
+ if (siteId == Integer.parseInt(portalMatomoID)) {
+ outFolder = portalLogPath;
+ } else {
+ outFolder = repoLogsPath;
+ }
+
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
+
+ int i = 0;
+
+ JSONParser parser = new JSONParser();
+ StringBuffer totalContent = new StringBuffer();
+ FileSystem fs = FileSystem.get(new Configuration());
+
+ do {
+ int writtenBytes = 0;
+ String apiUrl = baseApiUrl;
+
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
+
+ content = getJson(apiUrl);
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
+
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json"),
+ true);
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
+
+ writtenBytes += jsonObjectRawBytes.length + 1;
+ }
+
+ fin.close();
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json");
+
+ i++;
+ } while (true);
+
+ fs.close();
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikDownloadLogs_B2SHARE.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikDownloadLogs_B2SHARE.java
new file mode 100644
index 000000000..c17b9b8d1
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikDownloadLogs_B2SHARE.java
@@ -0,0 +1,204 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.*;
+import java.net.URL;
+import java.net.URLConnection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Downloads the Matomo (Piwik) visit logs of the B2SHARE site through the
+ * Live.getLastVisitsDetails API and stores them on HDFS as JSON files, one
+ * file per day/offset page.
+ *
+ * @author D. Pierrakos
+ */
+public class PiwikDownloadLogs_B2SHARE {
+
+ private final String piwikUrl;
+ private Date startDate;
+ private final String tokenAuth;
+
+ /*
+ * The Piwik's API method
+ */
+ private final String APImethod = "?module=API&method=Live.getLastVisitsDetails";
+ private final String format = "&format=json";
+
+ private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs_B2SHARE.class);
+
+ /**
+  * @param piwikUrl host name of the Matomo instance (no scheme)
+  * @param tokenAuth Matomo API authentication token
+  */
+ public PiwikDownloadLogs_B2SHARE(String piwikUrl, String tokenAuth) {
+ this.piwikUrl = piwikUrl;
+ this.tokenAuth = tokenAuth;
+
+ }
+
+ // Base URL of the Matomo instance; HTTPS is assumed.
+ private String getPiwikLogUrl() {
+ return "https://" + piwikUrl + "/";
+ }
+
+ /**
+  * Fetches the given URL and returns the whole response body as one string.
+  *
+  * @throws Exception if the connection fails or the body cannot be read
+  */
+ private String getJson(String url) throws Exception {
+ try {
+ logger.debug("Connecting to download the JSON: " + url);
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
+
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+ }
+ }
+ return response.toString();
+ } catch (Exception e) {
+ logger.error("Failed to get URL: " + url + " Exception: " + e);
+ throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e);
+ }
+ }
+
+ /**
+  * Downloads the B2SHARE logs, resuming from the last date already stored in
+  * the piwiklog table (or the configured starting period) up to yesterday.
+  *
+  * @param repoLogsPath HDFS folder receiving the downloaded log files
+  */
+ public void GetOpenAIREB2SHARELogs(String repoLogsPath) throws Exception {
+
+ Statement statement = ConnectDB.getHiveConnection().createStatement();
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+ // NOTE(review): raw List kept on purpose — the declared type of
+ // ExecuteWorkflow.b2SSHAREID is not visible here; confirm it is an int id
+ // before adding generics.
+ List piwikIdToVisit = new ArrayList();
+ piwikIdToVisit.add(ExecuteWorkflow.b2SSHAREID);
+ logger.info("B2SHARE piwikId for download: " + piwikIdToVisit);
+
+ if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0
+ && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) {
+ logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload);
+ }
+
+ logger.info("Downloading for the followins piwikIds: " + piwikIdToVisit);
+
+ // Threaded download is currently disabled; sites are processed sequentially.
+ for (int siteId : piwikIdToVisit) {
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("Starting period for log download: " + sdf.format(start.getTime()));
+
+ // Setting the ending period to yesterday so only complete days are fetched.
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
+ logger.info("Ending period for log download: " + sdf.format(end.getTime()));
+
+ logger.info("Now working on piwikId: " + siteId);
+
+ PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION
+ .prepareStatement(
+ "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklog WHERE source=?");
+ st.setInt(1, siteId);
+ Date dateMax = null;
+ ResultSet rs_date = st.executeQuery();
+ while (rs_date.next()) {
+ logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId);
+
+ // Resume from the last stored date instead of the configured start.
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
+
+ for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) {
+ logger.info("Date used " + currDay.getTime().toString());
+
+ // Skip days whose logs are already in the piwiklog table.
+ if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId);
+ } else {
+ GetOpenAIRELogsB2SHAREForDate(currDay, siteId, repoLogsPath);
+ }
+
+ }
+ }
+ }
+
+ /**
+  * Downloads one day of B2SHARE logs, paging through the API 1000 visits at a
+  * time, and writes each page to
+  * {@code <repoLogsPath>/<siteId>_Piwiklog<date>_offset_<i>.json}
+  * with one JSON object per line.
+  */
+ public void GetOpenAIRELogsB2SHAREForDate(Calendar currDay, int siteId, String repoLogsPath) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
+ Date date = currDay.getTime();
+ logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date));
+
+ String period = "&period=day&date=" + sdf.format(date);
+ String outFolder = repoLogsPath;
+
+ String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format
+ + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth;
+ String content = "";
+
+ int i = 0;
+
+ JSONParser parser = new JSONParser();
+ FileSystem fs = FileSystem.get(new Configuration());
+
+ do {
+ int writtenBytes = 0;
+ String apiUrl = baseApiUrl;
+
+ if (i > 0) {
+ apiUrl += "&filter_offset=" + (i * 1000);
+ }
+
+ content = getJson(apiUrl);
+ // An empty page means all visits of the day have been fetched.
+ if (content.length() == 0 || content.equals("[]")) {
+ break;
+ }
+
+ FSDataOutputStream fin = fs
+ .create(
+ new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json"),
+ true);
+ JSONArray jsonArray = (JSONArray) parser.parse(content);
+ for (Object aJsonArray : jsonArray) {
+ JSONObject jsonObjectRaw = (JSONObject) aJsonArray;
+ byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes();
+ fin.write(jsonObjectRawBytes);
+ fin.writeChar('\n');
+
+ writtenBytes += jsonObjectRawBytes.length + 1;
+ }
+
+ fin.close();
+ System.out
+ .println(
+ Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes
+ + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i
+ + ".json");
+
+ i++;
+ } while (true);
+
+ fs.close();
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikStatsDB.java
new file mode 100644
index 000000000..28ec3603c
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikStatsDB.java
@@ -0,0 +1,878 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.*;
+import java.net.URLDecoder;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class PiwikStatsDB {
+
+ private String logPath;
+ private String logRepoPath;
+ private String logPortalPath;
+
+ private Statement stmt = null;
+
+ private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class);
+
+ private String CounterRobotsURL;
+ private ArrayList robotsList;
+
+ /**
+  * Creates a processor bound to the two HDFS folders that hold the downloaded
+  * repository and portal log files.
+  *
+  * @param logRepoPath HDFS folder with the repository logs
+  * @param logPortalPath HDFS folder with the portal logs
+  */
+ public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception {
+ this.logPortalPath = logPortalPath;
+ this.logRepoPath = logRepoPath;
+ }
+
+ /**
+  * Drops and recreates the HDFS directories that receive the downloaded
+  * repository and portal logs, so every run starts from an empty state.
+  */
+ public void reCreateLogDirs() throws IllegalArgumentException, IOException {
+ FileSystem dfs = FileSystem.get(new Configuration());
+
+ Path repoLogDir = new Path(ExecuteWorkflow.repoLogPath);
+ Path portalLogDir = new Path(ExecuteWorkflow.portalLogPath);
+
+ // Remove both directories first, then recreate them, in the same order as before.
+ logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
+ dfs.delete(repoLogDir, true);
+
+ logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
+ dfs.delete(portalLogDir, true);
+
+ logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
+ dfs.mkdirs(repoLogDir);
+
+ logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
+ dfs.mkdirs(portalLogDir);
+ }
+
+ /**
+  * Rebuilds the usage-stats schema from scratch: drops/creates the database and
+  * then its tables. The piwiklog temp tables are not created here since they
+  * are built later on top of the downloaded JSON files.
+  */
+ public void recreateDBAndTables() throws Exception {
+ createDatabase();
+ createTables();
+ }
+
+ // Returns the COUNTER robots patterns previously stored via setRobotsList.
+ public ArrayList getRobotsList() {
+ return robotsList;
+ }
+
+ // Stores the list of robot/crawler patterns used when processing the logs.
+ public void setRobotsList(ArrayList robotsList) {
+ this.robotsList = robotsList;
+ }
+
+ // URL of the COUNTER robots list consumed by processLogs().
+ public String getCounterRobotsURL() {
+ return CounterRobotsURL;
+ }
+
+ // Sets the URL from which the COUNTER robots list is downloaded.
+ public void setCounterRobotsURL(String CounterRobotsURL) {
+ this.CounterRobotsURL = CounterRobotsURL;
+ }
+
+ /**
+  * Drops (CASCADE) and re-creates the usage-stats Hive database. Each step has
+  * its own try/catch so the failure cause is reported precisely.
+  */
+ private void createDatabase() throws Exception {
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
+ String dropDatabaseSql = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE";
+ stmt.executeUpdate(dropDatabaseSql);
+
+ } catch (Exception e) {
+ logger.error("Failed to drop database: " + e);
+ throw new Exception("Failed to drop database: " + e.toString(), e);
+ }
+
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema());
+ String createDatabaseSql = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema();
+ stmt.executeUpdate(createDatabaseSql);
+
+ } catch (Exception e) {
+ logger.error("Failed to create database: " + e);
+ throw new Exception("Failed to create database: " + e.toString(), e);
+ }
+ }
+
+ /**
+  * Creates the transactional piwiklog and process_portal_log Hive tables if
+  * they do not already exist. The exact DDL (clustering columns, bucket count,
+  * ORC storage, transactional properties) must stay as-is — downstream inserts
+  * and the Hive ACID configuration depend on it.
+  */
+ private void createTables() throws Exception {
+ try {
+ stmt = ConnectDB.getHiveConnection().createStatement();
+
+ // Create Piwiklog table - This table should exist
+ String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, action, timestamp, entity_id) "
+ + "into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTablePiwikLog);
+
+// String dropT = "TRUNCATE TABLE "
+// + ConnectDB.getUsageStatsDBSchema()
+// + ".piwiklog ";
+// stmt.executeUpdate(dropT);
+// logger.info("truncated piwiklog");
+
+ /////////////////////////////////////////
+ // Rule for duplicate inserts @ piwiklog
+ /////////////////////////////////////////
+ String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, "
+ + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTablePortalLog);
+
+ //////////////////////////////////////////////////
+ // Rule for duplicate inserts @ process_portal_log
+ //////////////////////////////////////////////////
+ // Release the statement and connection; later stages reopen them as needed.
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
+
+ /**
+  * End-to-end processing pipeline for the downloaded logs: loads the COUNTER
+  * robots list, imports repository and portal JSON files into Hive, removes
+  * double clicks, normalises OAI identifiers, promotes the temp tables into
+  * the production ones and finally builds the PeDOCS and Datacite usage data.
+  * Each stage logs its start and end so a failure can be located quickly.
+  */
+ public void processLogs() throws Exception {
+ try {
+ // Robot/crawler patterns are fetched once and reused by the cleaning steps.
+ ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL());
+ this.robotsList = counterRobots.getRobotsPatterns();
+
+ logger.info("Processing repository logs");
+ processRepositoryLog();
+ logger.info("Repository logs process done");
+
+ logger.info("Removing double clicks");
+ removeDoubleClicks();
+ logger.info("Removing double clicks done");
+
+ logger.info("Cleaning oai");
+ cleanOAI();
+ logger.info("Cleaning oai done");
+
+ logger.info("Processing portal logs");
+ processPortalLog();
+ logger.info("Portal logs process done");
+
+ logger.info("Processing portal usagestats");
+ portalLogs();
+ logger.info("Portal usagestats process done");
+
+ logger.info("Updating Production Tables");
+ updateProdTables();
+ logger.info("Updated Production Tables");
+
+ logger.info("Create Pedocs Tables");
+ createPedocsOldUsageData();
+ logger.info("Pedocs Tables Created");
+
+ logger.info("Create Datacite Tables");
+ createDatasetsUsageData();
+ logger.info("Datacite Tables Created");
+
+ } catch (Exception e) {
+ logger.error("Failed to process logs: " + e);
+ throw new Exception("Failed to process logs: " + e.toString(), e);
+ }
+ }
+
+ /**
+  * Imports the repository JSON log files from HDFS into Hive: maps them with
+  * an external JSON-SerDe table (piwiklogtmp_json) and explodes each visit's
+  * actionDetails array into one row per action in the transactional
+  * piwiklogtmp table. The statement order matters — the SerDe jar must be
+  * registered on this Hive session before the external table is queried.
+  */
+ public void processRepositoryLog() throws Exception {
+
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Adding JSON Serde jar");
+ stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+ logger.info("Added JSON Serde jar");
+
+ logger.info("Dropping piwiklogtmp_json table");
+ String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklogtmp_json";
+ stmt.executeUpdate(drop_piwiklogtmp_json);
+ logger.info("Dropped piwiklogtmp_json table");
+
+ logger.info("Creating piwiklogtmp_json");
+ // External, non-transactional table laid directly over the downloaded JSON
+ // files; the struct mirrors the Matomo Live.getLastVisitsDetails response.
+ String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklogtmp_json(\n"
+ + " `idSite` STRING,\n"
+ + " `idVisit` STRING,\n"
+ + " `country` STRING,\n"
+ + " `referrerName` STRING,\n"
+ + " `browser` STRING,\n"
+ + " `actionDetails` ARRAY<\n"
+ + " struct<\n"
+ + " type: STRING,\n"
+ + " url: STRING,\n"
+ + " `customVariables`: struct<\n"
+ + " `1`: struct<\n"
+ + " `customVariablePageValue1`: STRING\n"
+ + " >\n"
+ + " >,\n"
+ + " timestamp: String\n"
+ + " >\n"
+ + " >\n"
+ + ")\n"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(create_piwiklogtmp_json);
+ logger.info("Created piwiklogtmp_json");
+
+ logger.info("Dropping piwiklogtmp table");
+ String drop_piwiklogtmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklogtmp";
+ stmt.executeUpdate(drop_piwiklogtmp);
+ logger.info("Dropped piwiklogtmp");
+
+ logger.info("Creating piwiklogtmp");
+ String create_piwiklogtmp = "CREATE TABLE "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, "
+ + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(create_piwiklogtmp);
+ logger.info("Created piwiklogtmp");
+
+ logger.info("Inserting into piwiklogtmp");
+ // LATERAL VIEW explode turns each visit's actionDetails array into one row
+ // per action; DISTINCT guards against re-imported duplicate files.
+ String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, "
+ + "actiondetail.type as action, actiondetail.url as url, "
+ + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, "
+ + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, "
+ + "referrerName as referrer_name, browser as agent\n"
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n"
+ + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+ stmt.executeUpdate(insert_piwiklogtmp);
+ logger.info("Inserted into piwiklogtmp");
+
+ stmt.close();
+ }
+
+ public void removeDoubleClicks() throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Cleaning download double clicks");
+ // clean download double clicks
+ String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "WHERE EXISTS (\n"
+ + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n"
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, "
+ + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n"
+ + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+ + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n"
+ + "AND p1.timestamp\n"
+ + " >\n"
+ + ")\n"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(create_process_portal_log_tmp_json);
+ logger.info("Created process_portal_log_tmp_json");
+
+ logger.info("Droping process_portal_log_tmp table");
+ String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".process_portal_log_tmp";
+ stmt.executeUpdate(drop_process_portal_log_tmp);
+ logger.info("Dropped process_portal_log_tmp");
+
+ logger.info("Creating process_portal_log_tmp");
+ String create_process_portal_log_tmp = "CREATE TABLE "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, "
+ + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(create_process_portal_log_tmp);
+ logger.info("Created process_portal_log_tmp");
+
+ logger.info("Inserting into process_portal_log_tmp");
+ String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ + ".process_portal_log_tmp "
+ + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, "
+ + "actiondetail.url as url, "
+ + "CASE\n"
+ + " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] "
+ + " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] "
+ + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] "
+ + " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] "
+ + " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] "
+ + " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] "
+ + " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] "
+ + " ELSE '' "
+ + "END AS entity_id, "
+ + "CASE "
+ + " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' "
+ + " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' "
+ + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' "
+ + " WHEN (actiondetail.url like '%articleId=%') THEN 'result' "
+ + " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' "
+ + " WHEN (actiondetail.url like '%projectId=%') THEN 'project' "
+ + " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' "
+ + " ELSE '' "
+ + "END AS source_item_type, "
+ + "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, "
+ + "browser as agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json "
+ + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
+ stmt.executeUpdate(insert_process_portal_log_tmp);
+ logger.info("Inserted into process_portal_log_tmp");
+
+ stmt.close();
+ }
+
+ public void portalLogs() throws SQLException {
+ Connection con = ConnectDB.getHiveConnection();
+ Statement stmt = con.createStatement();
+ con.setAutoCommit(false);
+
+ logger.info("PortalStats - Step 1");
+ String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ + ".result_oids roid WHERE roid.id IS NOT NULL)";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("PortalStats - Step 2");
+ stmt = con.createStatement();
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ + ".datasource_oids roid WHERE roid.id IS NOT NULL)";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ /*
+ * logger.info("PortalStats - Step 3"); stmt = con.createStatement(); sql = "INSERT INTO " +
+ * ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " +
+ * "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'organization', `timestamp`, referrer_name, agent "
+ * + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " +
+ * "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " +
+ * "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() +
+ * ".organization_oids roid WHERE roid.id IS NOT NULL)"; // stmt.executeUpdate(sql); stmt.close();
+ */
+ logger.info("PortalStats - Step 3");
+ stmt = con.createStatement();
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp "
+ + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id "
+ + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema()
+ + ".project_oids roid WHERE roid.id IS NOT NULL)";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ con.close();
+ }
+
+ private void cleanOAI() throws Exception {
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Cleaning oai - Step 1");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/',"
+ + "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 2");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/',"
+ + "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 3");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/',"
+ + "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 4");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/',"
+ + "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 5");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/',"
+ + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 6");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/',"
+ + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 7");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/',"
+ + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 8");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/',"
+ + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 9");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/',"
+ + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 10");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/',"
+ + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 11");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/',"
+ + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 12");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/',"
+ + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 13");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/',"
+ + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 14");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/',"
+ + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 15");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/',"
+ + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 16");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/',"
+ + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 17");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/',"
+ + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 18");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/',"
+ + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 19");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/',"
+ + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 20");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/',"
+ + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 21");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/',"
+ + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 22");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/',"
+ + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 23");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/',"
+ + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 24");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/',"
+ + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 25");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/',"
+ + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 26");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/',"
+ + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 27");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/',"
+ + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 28");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/',"
+ + "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Step 29");
+ stmt = ConnectDB.getHiveConnection().createStatement();
+ sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp "
+ + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/',"
+ + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'";
+ stmt.executeUpdate(sql);
+ stmt.close();
+
+ logger.info("Cleaning oai - Done, closing connection");
+ ConnectDB.getHiveConnection().close();
+ }
+
+ private void updateProdTables() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Inserting data to piwiklog");
+ String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog "
+ + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
+ stmt.executeUpdate(sql);
+
+ logger.info("Dropping piwiklogtmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped piwiklogtmp");
+
+ logger.info("Dropping process_portal_log_tmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped process_portal_log_tmp");
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+
+ }
+
+ public void finalizeStats() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Dropping piwiklogtmp");
+ String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped piwiklogtmp");
+
+ logger.info("Dropping process_portal_log_tmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped process_portal_log_tmp");
+
+ logger.info("Dropping irus_sushilogtmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped irus_sushilogtmp");
+
+ logger.info("Dropping irus_sushilogtmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped irus_sushilogtmp_json");
+
+ logger.info("Dropping lareferencialogtmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped lareferencialogtmp_json");
+
+ logger.info("Dropping piwiklogtmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped piwiklogtmp_json");
+
+ logger.info("Dropping process_portal_log_tmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped process_portal_log_tmp_json");
+
+ logger.info("Dropping sarc_sushilogtmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped sarc_sushilogtmp");
+
+ logger.info("Dropping sarc_sushilogtmp_json_array");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped sarc_sushilogtmp_json_array");
+
+ logger.info("Dropping sarc_sushilogtmp_json_non_array");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped sarc_sushilogtmp_json_non_array");
+
+ logger.info("Dropping piwiklogb2sharetmp");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped piwiklogb2sharetmp");
+
+ logger.info("Dropping piwiklog_b2share_tmp_json");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog_b2share_tmp_json";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped piwiklog_b2share_tmp_json");
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+
+ }
+
+ private ArrayList listHdfsDir(String dir) throws Exception {
+
+ FileSystem hdfs = FileSystem.get(new Configuration());
+ RemoteIterator Files;
+ ArrayList fileNames = new ArrayList<>();
+
+ try {
+ Path exportPath = new Path(hdfs.getUri() + dir);
+ Files = hdfs.listFiles(exportPath, false);
+ while (Files.hasNext()) {
+ String fileName = Files.next().getPath().toString();
+ fileNames.add(fileName);
+ }
+
+ hdfs.close();
+ } catch (Exception e) {
+ logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath));
+ throw new Exception("HDFS file path with exported data does not exist : " + logPath, e);
+ }
+
+ return fileNames;
+ }
+
+ private String readHDFSFile(String filename) throws Exception {
+ String result;
+ try {
+
+ FileSystem fs = FileSystem.get(new Configuration());
+ // log.info("reading file : " + filename);
+
+ BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));
+
+ StringBuilder sb = new StringBuilder();
+ String line = br.readLine();
+
+ while (line != null) {
+ if (!line.equals("[]")) {
+ sb.append(line);
+ }
+ // sb.append(line);
+ line = br.readLine();
+ }
+ result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
+ if (result.equals("")) {
+ result = "[]";
+ }
+
+ // fs.close();
+ } catch (Exception e) {
+ logger.error(e.getMessage());
+ throw new Exception(e);
+ }
+
+ return result;
+ }
+
+ private Connection getConnection() throws SQLException {
+ return ConnectDB.getHiveConnection();
+ }
+
+ public void createPedocsOldUsageData() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Creating PeDocs Old Views Table");
+ String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".pedocsoldviews as select * from default.pedocsviews";
+ stmt.executeUpdate(sql);
+ logger.info("PeDocs Old Views Table created");
+
+ logger.info("Creating PeDocs Old Downloads Table");
+ sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".pedocsolddownloads as select * from default.pedocsdownloads";
+ stmt.executeUpdate(sql);
+ logger.info("PeDocs Old Downloads Table created");
+
+ }
+
+ public void createDatasetsUsageData() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Dropping datacite_views");
+ String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".datacite_views";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped datacite_views");
+
+ logger.info("Dropping datacite_downloads");
+ sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".datacite_downloads";
+ stmt.executeUpdate(sql);
+ logger.info("Dropped datacite_downloads");
+
+ logger.info("Creating Datasets Views Table");
+ sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".datacite_views as select * from openaire_prod_datacite_usage_stats.datacite_views";
+ stmt.executeUpdate(sql);
+ logger.info("Datasets Views Table created");
+
+ logger.info("Creating Datasets Downloads Table");
+ sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".datacite_downloads as select * from openaire_prod_datacite_usage_stats.datacite_downloads";
+ stmt.executeUpdate(sql);
+ logger.info("Datasets Downloads Table created");
+
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikStatsDB_B2SHARE.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikStatsDB_B2SHARE.java
new file mode 100644
index 000000000..8b85a3e90
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/PiwikStatsDB_B2SHARE.java
@@ -0,0 +1,304 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.*;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class PiwikStatsDB_B2SHARE {
+
+ private String logPath;
+ private String logRepoPath;
+ private String logPortalPath;
+
+ private Statement stmt = null;
+
+ private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB_B2SHARE.class);
+
+ private String CounterRobotsURL;
+ private ArrayList robotsList;
+
	/**
	 * @param logRepoPath   path of the exported B2SHARE repository logs
	 * @param logPortalPath path of the exported portal logs
	 *
	 * NOTE(review): processLog() reads ExecuteWorkflow.repoLogPath rather than
	 * the logRepoPath field set here — confirm which one is authoritative.
	 */
	public PiwikStatsDB_B2SHARE(String logRepoPath, String logPortalPath) throws Exception {
		this.logRepoPath = logRepoPath;
		this.logPortalPath = logPortalPath;

	}
+
	/** @return the configured robot/crawler pattern list (may be null if never set). */
	public ArrayList getRobotsList() {
		return robotsList;
	}
+
	/** @param robotsList robot/crawler patterns used for filtering bot traffic. */
	public void setRobotsList(ArrayList robotsList) {
		this.robotsList = robotsList;
	}
+
	/** @return the URL of the COUNTER robots list, as previously set. */
	public String getCounterRobotsURL() {
		return CounterRobotsURL;
	}
+
	/** @param CounterRobotsURL URL of the COUNTER robots list to use. */
	public void setCounterRobotsURL(String CounterRobotsURL) {
		this.CounterRobotsURL = CounterRobotsURL;
	}
+
	/**
	 * Orchestrates the B2SHARE usage-log pipeline: parse the raw logs into
	 * Hive (processLog), remove download double clicks, then copy the results
	 * to the production tables. Any failure is logged and rethrown wrapped in
	 * a plain Exception.
	 */
	public void processB2SHARELogs() throws Exception {
		try {

			logger.info("Processing B2SHARE logs");
			processLog();
			logger.info("B2SHARE logs process done");

			logger.info("Removing double clicks from B2SHARE logs");
			removeDoubleClicks();
			logger.info("Removing double clicks from B2SHARE logs done");

			logger.info("Updating Production Tables");
			updateProdTables();
			logger.info("Updated Production Tables");

		} catch (Exception e) {
			logger.error("Failed to process logs: " + e);
			throw new Exception("Failed to process logs: " + e.toString(), e);
		}
	}
+
	/**
	 * Loads the exported B2SHARE Piwik logs into Hive: maps the raw JSON dump
	 * onto an external table (piwiklog_b2share_tmp_json), then flattens each
	 * visit's actionDetails into one row per action in piwiklogb2sharetmp.
	 * Statement order matters: tables are dropped/created before the insert.
	 */
	public void processLog() throws Exception {

		Statement stmt = ConnectDB.getHiveConnection().createStatement();
		ConnectDB.getHiveConnection().setAutoCommit(false);

		// The JSON SerDe jar is needed so Hive can parse the raw log dump.
		logger.info("Adding JSON Serde jar");
		stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
		logger.info("Added JSON Serde jar");

		logger.info("Dropping piwiklog_b2share_tmp_json table");
		String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS "
			+ ConnectDB.getUsageStatsDBSchema()
			+ ".piwiklog_b2share_tmp_json";
		stmt.executeUpdate(drop_piwiklogtmp_json);
		logger.info("Dropped piwiklog_b2share_tmp_json table");

		// External table over the raw JSON export; the location comes from
		// the workflow parameter, not from this instance's logRepoPath field.
		logger.info("Creating piwiklog_b2share_tmp_json");
		String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS "
			+ ConnectDB.getUsageStatsDBSchema()
			+ ".piwiklog_b2share_tmp_json(\n"
			+ "	`idSite` STRING,\n"
			+ "	`idVisit` STRING,\n"
			+ "	`country` STRING,\n"
			+ "	`referrerName` STRING,\n"
			+ "	`browser` STRING,\n"
			+ "	`actionDetails` ARRAY<\n"
			+ "						struct<\n"
			+ "							type: STRING,\n"
			+ "							url: STRING,\n"
			+ "							eventAction: STRING,\n"
			+ "							eventName: STRING,\n"
			+ "							timestamp: String\n"
			+ "							>\n"
			+ "						>\n"
			+ ")\n"
			+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
			+ "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n"
			+ "TBLPROPERTIES (\"transactional\"=\"false\")";
		stmt.executeUpdate(create_piwiklogtmp_json);
		logger.info("Created piwiklog_b2share_tmp_json");

		// NOTE(review): this drops the shared piwiklogtmp table, not
		// piwiklogb2sharetmp which is created just below — confirm intentional.
		logger.info("Dropping piwiklogtmp table");
		String drop_piwiklogtmp = "DROP TABLE IF EXISTS "
			+ ConnectDB.getUsageStatsDBSchema()
			+ ".piwiklogtmp";
		stmt.executeUpdate(drop_piwiklogtmp);
		logger.info("Dropped piwiklogtmp");

		logger.info("Creating piwiklogb2sharetmp");
		String create_piwiklogtmp = "CREATE TABLE "
			+ ConnectDB.getUsageStatsDBSchema()
			+ ".piwiklogb2sharetmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, "
			+ "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
			+ "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')";
		stmt.executeUpdate(create_piwiklogtmp);
		logger.info("Created piwiklogb2sharetmp");

		// Flatten actionDetails: one output row per (visit, action), with the
		// Piwik event name carried over as the entity_id.
		logger.info("Inserting into piwiklogb2sharetmp");
		String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp "
			+ "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, "
			+ "actiondetail.eventAction as action, actiondetail.url as url, "
			+ "actiondetail.eventName as entity_id, "
			+ "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, "
			+ "referrerName as referrer_name, browser as agent\n"
			+ "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog_b2share_tmp_json\n"
			+ "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail";
		stmt.executeUpdate(insert_piwiklogtmp);
		logger.info("Inserted into piwiklogb2sharetmp");

		stmt.close();
	}
+
+ public void removeDoubleClicks() throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Cleaning download double clicks");
+ // clean download double clicks
+ String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp "
+ + "WHERE EXISTS (\n"
+ + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n"
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp p1, "
+ + ConnectDB.getUsageStatsDBSchema() + ".piwiklogb2sharetmp p2\n"
+ + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n"
+ + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n"
+ + "AND p1.timestamp listHdfsDir(String dir) throws Exception {
+
+ FileSystem hdfs = FileSystem.get(new Configuration());
+ RemoteIterator Files;
+ ArrayList fileNames = new ArrayList<>();
+
+ try {
+ Path exportPath = new Path(hdfs.getUri() + dir);
+ Files = hdfs.listFiles(exportPath, false);
+ while (Files.hasNext()) {
+ String fileName = Files.next().getPath().toString();
+ fileNames.add(fileName);
+ }
+
+ hdfs.close();
+ } catch (Exception e) {
+ logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath));
+ throw new Exception("HDFS file path with exported data does not exist : " + logPath, e);
+ }
+
+ return fileNames;
+ }
+
+ private String readHDFSFile(String filename) throws Exception {
+ String result;
+ try {
+
+ FileSystem fs = FileSystem.get(new Configuration());
+ // log.info("reading file : " + filename);
+
+ BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));
+
+ StringBuilder sb = new StringBuilder();
+ String line = br.readLine();
+
+ while (line != null) {
+ if (!line.equals("[]")) {
+ sb.append(line);
+ }
+ // sb.append(line);
+ line = br.readLine();
+ }
+ result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
+ if (result.equals("")) {
+ result = "[]";
+ }
+
+ // fs.close();
+ } catch (Exception e) {
+ logger.error(e.getMessage());
+ throw new Exception(e);
+ }
+
+ return result;
+ }
+
+ private Connection getConnection() throws SQLException {
+ return ConnectDB.getHiveConnection();
+ }
+
+ public void createPedocsOldUsageData() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Creating PeDocs Old Views Table");
+ String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".pedocsoldviews as select * from default.pedocsviews";
+ stmt.executeUpdate(sql);
+ logger.info("PeDocs Old Views Table created");
+
+ logger.info("Creating PeDocs Old Downloads Table");
+ sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".pedocsolddownloads as select * from default.pedocsdownloads";
+ stmt.executeUpdate(sql);
+ logger.info("PeDocs Old Downloads Table created");
+
+ }
+
+ public void createDatasetsUsageData() throws SQLException {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Creating Datasets Views Table");
+ String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".datacite_views as select * from datasetsusagestats_20210301.datacite_views";
+ stmt.executeUpdate(sql);
+ logger.info("Datasets Views Table created");
+
+ logger.info("Creating Datasets Downloads Table");
+ sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".datacite_downloads as select * from datasetsusagestats_20210301.datacite_downloads";
+ stmt.executeUpdate(sql);
+ logger.info("Datasets Downloads Table created");
+
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ReadCounterRobotsList.java
new file mode 100644
index 000000000..8d0a0d6b3
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/ReadCounterRobotsList.java
@@ -0,0 +1,54 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
/**
 * @author D. Pierrakos, S. Zoupanos
 */
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+
+import org.json.JSONException;
+import org.json.simple.JSONArray;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+public class ReadCounterRobotsList {
+
+ private ArrayList robotsPatterns = new ArrayList();
+ private String COUNTER_ROBOTS_URL;
+
+ public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException {
+ COUNTER_ROBOTS_URL = url;
+ robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL);
+ }
+
+ private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException {
+ InputStream is = new URL(url).openStream();
+ JSONParser parser = new JSONParser();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1")));
+ JSONArray jsonArray = (JSONArray) parser.parse(reader);
+ for (Object aJsonArray : jsonArray) {
+ org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray;
+ robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\"));
+ }
+ return robotsPatterns;
+ }
+
+ public ArrayList getRobotsPatterns() {
+ return robotsPatterns;
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/SarcStats.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/SarcStats.java
new file mode 100644
index 000000000..ce17f6660
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/SarcStats.java
@@ -0,0 +1,500 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.*;
+// import java.io.BufferedReader;
+// import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class SarcStats {
+
+ private Statement stmtHive = null;
+ private Statement stmtImpala = null;
+
+ private static final Logger logger = LoggerFactory.getLogger(SarcStats.class);
+
+ public SarcStats() throws Exception {
+// createTables();
+ }
+
+ private void createTables() throws Exception {
+ try {
+
+ stmtHive = ConnectDB.getHiveConnection().createStatement();
+ String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));";
+ stmtHive.executeUpdate(sqlCreateTableSushiLog);
+
+ // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;";
+ // stmt.executeUpdate(sqlCopyPublicSushiLog);
+ String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
+ + " ON INSERT TO sushilog "
+ + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository,"
+ + "sushilog.rid, sushilog.date "
+ + "FROM sushilog "
+ + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;";
+ stmtHive.executeUpdate(sqlcreateRuleSushiLog);
+ String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);";
+ stmtHive.executeUpdate(createSushiIndex);
+
+ stmtHive.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Sushi Tables Created");
+ } catch (Exception e) {
+ logger.error("Failed to create tables: " + e);
+ throw new Exception("Failed to create tables: " + e.toString(), e);
+ }
+ }
+
+ public void reCreateLogDirs() throws IOException {
+ FileSystem dfs = FileSystem.get(new Configuration());
+
+ logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
+ dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true);
+
+ logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
+ dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true);
+
+ logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray);
+ dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray));
+
+ logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray);
+ dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray));
+ }
+
+ public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Adding JSON Serde jar");
+ stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar");
+ logger.info("Added JSON Serde jar");
+
+ logger.info("Dropping sarc_sushilogtmp_json_array table");
+ String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array";
+ stmt.executeUpdate(drop_sarc_sushilogtmp_json_array);
+ logger.info("Dropped sarc_sushilogtmp_json_array table");
+
+ logger.info("Creating sarc_sushilogtmp_json_array table");
+ String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n"
+ + " `ItemIdentifier` ARRAY<\n"
+ + " struct<\n"
+ + " `Type`: STRING,\n"
+ + " `Value`: STRING\n"
+ + " >\n"
+ + " >,\n"
+ + " `ItemPerformance` struct<\n"
+ + " `Period`: struct<\n"
+ + " `Begin`: STRING,\n"
+ + " `End`: STRING\n"
+ + " >,\n"
+ + " `Instance`: struct<\n"
+ + " `Count`: STRING,\n"
+ + " `MetricType`: STRING\n"
+ + " >\n"
+ + " >\n"
+ + ")"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + sarcsReportPathArray + "/'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(create_sarc_sushilogtmp_json_array);
+ logger.info("Created sarc_sushilogtmp_json_array table");
+
+ logger.info("Dropping sarc_sushilogtmp_json_non_array table");
+ String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".sarc_sushilogtmp_json_non_array";
+ stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array);
+ logger.info("Dropped sarc_sushilogtmp_json_non_array table");
+
+ logger.info("Creating sarc_sushilogtmp_json_non_array table");
+ String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n"
+ + " `ItemIdentifier` struct<\n"
+ + " `Type`: STRING,\n"
+ + " `Value`: STRING\n"
+ + " >,\n"
+ + " `ItemPerformance` struct<\n"
+ + " `Period`: struct<\n"
+ + " `Begin`: STRING,\n"
+ + " `End`: STRING\n"
+ + " >,\n"
+ + " `Instance`: struct<\n"
+ + " `Count`: STRING,\n"
+ + " `MetricType`: STRING\n"
+ + " >\n"
+ + " >"
+ + ")"
+ + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n"
+ + "LOCATION '" + sarcsReportPathNonArray + "/'\n"
+ + "TBLPROPERTIES (\"transactional\"=\"false\")";
+ stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array);
+ logger.info("Created sarc_sushilogtmp_json_non_array table");
+
+ logger.info("Creating sarc_sushilogtmp table");
+ String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sarc_sushilogtmp(source STRING, repository STRING, "
+ + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc "
+ + "tblproperties('transactional'='true')";
+ stmt.executeUpdate(create_sarc_sushilogtmp);
+ logger.info("Created sarc_sushilogtmp table");
+
+ logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
+ String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
+ + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
+ + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
+ + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array "
+ + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent "
+ + "WHERE `ItemIdent`.`Type`='DOI'";
+ stmt.executeUpdate(insert_sarc_sushilogtmp);
+ logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)");
+
+ logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
+ insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp "
+ + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], "
+ + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, "
+ + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` "
+ + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array";
+ stmt.executeUpdate(insert_sarc_sushilogtmp);
+ logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)");
+
+ ConnectDB.getHiveConnection().close();
+ }
+
+ public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception {
+
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ logger.info("Creating sushilog table");
+ String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog "
+ + "(`source` string, "
+ + "`repository` string, "
+ + "`rid` string, "
+ + "`date` string, "
+ + "`metric_type` string, "
+ + "`count` int)";
+ stmt.executeUpdate(createSushilog);
+ logger.info("Created sushilog table");
+
+ logger.info("Dropping sarc_sushilogtmp table");
+ String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS "
+ + ConnectDB.getUsageStatsDBSchema()
+ + ".sarc_sushilogtmp";
+ stmt.executeUpdate(drop_sarc_sushilogtmp);
+ logger.info("Dropped sarc_sushilogtmp table");
+ ConnectDB.getHiveConnection().close();
+
+ List<String[]> issnAndUrls = new ArrayList<String[]>();
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030"
+ });
+ issnAndUrls.add(new String[] {
+ "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826"
+ });
+ issnAndUrls.add(new String[] {
+ "https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015"
+ });
+
+ if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0
+ && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) {
+ logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload);
+ issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload);
+ }
+
+ logger.info("(getAndProcessSarc) Downloading the following " + issnAndUrls.size() + " ISSNs");
+
+ for (String[] issnAndUrl : issnAndUrls) {
+ logger.info("Now working on ISSN: " + issnAndUrl[1]);
+ getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]);
+ }
+
+ }
+
+ public void updateSarcLogs() throws Exception {
+ stmtHive = ConnectDB.getHiveConnection().createStatement();
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+ stmtImpala = ConnectDB.getImpalaConnection().createStatement();
+
+ // Insert into sushilog
+ logger.info("Inserting into sushilog");
+ String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp";
+ stmtHive.executeUpdate(insertSushiLog);
+ logger.info("Inserted into sushilog");
+
+ stmtHive.close();
+ ConnectDB.getHiveConnection().close();
+ }
+
+ public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray,
+ String url, String issn) throws Exception {
+ logger.info("Processing SARC! issn: " + issn + " with url: " + url);
+ ConnectDB.getHiveConnection().setAutoCommit(false);
+
+ SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM");
+ // Setting the starting period
+ Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone();
+ logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime()));
+
+ // Setting the ending period (last day of the month)
+// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone();
+// end.add(Calendar.MONTH, +1);
+// end.add(Calendar.DAY_OF_MONTH, -1);
+ Calendar end = Calendar.getInstance();
+ end.add(Calendar.DAY_OF_MONTH, -1);
+
+ logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime()));
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+ PreparedStatement st = ConnectDB
+ .getHiveConnection()
+ .prepareStatement(
+ "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?");
+ st.setString(1, issn);
+ ResultSet rs_date = st.executeQuery();
+ Date dateMax = null;
+ while (rs_date.next()) {
+ if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null")
+ && !rs_date.getString(1).equals("")) {
+ start.setTime(sdf.parse(rs_date.getString(1)));
+ dateMax = sdf.parse(rs_date.getString(1));
+ }
+ }
+ rs_date.close();
+
+ // Creating the needed configuration for the correct storing of data
+ Configuration config = new Configuration();
+ config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
+ config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
+ config
+ .set(
+ "fs.hdfs.impl",
+ org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+ config
+ .set(
+ "fs.file.impl",
+ org.apache.hadoop.fs.LocalFileSystem.class.getName());
+ FileSystem dfs = FileSystem.get(config);
+
+ if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) {
+ logger.info("Date found in logs " + dateMax + " and not downloading logs for " + issn);
+ } else {
+ start.add(Calendar.MONTH, 1);
+ while (start.before(end)) {
+ String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate="
+ + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime());
+ start.add(Calendar.MONTH, 1);
+
+ logger.info("(getARReport) Getting report: " + reportUrl);
+ String text = getJson(reportUrl);
+ if (text == null) {
+ continue;
+ }
+
+ JSONParser parser = new JSONParser();
+ JSONObject jsonObject = null;
+ try {
+ jsonObject = (JSONObject) parser.parse(text);
+ } // if there is a parsing error continue with the next url
+ catch (ParseException pe) {
+ continue;
+ }
+
+ jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse");
+ jsonObject = (JSONObject) jsonObject.get("sc:Report");
+ if (jsonObject == null) {
+ continue;
+ }
+ jsonObject = (JSONObject) jsonObject.get("c:Report");
+ jsonObject = (JSONObject) jsonObject.get("c:Customer");
+ Object obj = jsonObject.get("c:ReportItems");
+ JSONArray jsonArray = new JSONArray();
+ if (obj instanceof JSONObject) {
+ jsonArray.add(obj);
+ } else {
+ jsonArray = (JSONArray) obj;
+ // jsonArray = (JSONArray) jsonObject.get("c:ReportItems");
+ }
+ if (jsonArray == null) {
+ continue;
+ }
+
+ // Creating the file in the filesystem for the ItemIdentifier as array object
+ String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_"
+ + simpleDateFormat.format(start.getTime()) + ".json";
+ logger.info("Storing to file: " + filePathArray);
+ FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true);
+
+ // Creating the file in the filesystem for the ItemIdentifier as array object
+ String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_"
+ + simpleDateFormat.format(start.getTime()) + ".json";
+ logger.info("Storing to file: " + filePathNonArray);
+ FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true);
+
+ for (Object aJsonArray : jsonArray) {
+
+ JSONObject jsonObjectRow = (JSONObject) aJsonArray;
+ renameKeysRecursively(":", jsonObjectRow);
+
+ if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) {
+ finNonArray.write(jsonObjectRow.toJSONString().getBytes());
+ finNonArray.writeChar('\n');
+ } else {
+ finArray.write(jsonObjectRow.toJSONString().getBytes());
+ finArray.writeChar('\n');
+ }
+ }
+
+ finArray.close();
+ finNonArray.close();
+
+ // Check the file size and if it is too big, delete it
+ File fileArray = new File(filePathArray);
+ if (fileArray.length() == 0) {
+ fileArray.delete();
+ }
+ File fileNonArray = new File(filePathNonArray);
+ if (fileNonArray.length() == 0) {
+ fileNonArray.delete();
+ }
+
+ }
+
+ dfs.close();
+ }
+ // ConnectDB.getHiveConnection().close();
+ }
+
+ private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception {
+ for (Object jjval : givenJsonObj) {
+ if (jjval instanceof JSONArray) {
+ renameKeysRecursively(delimiter, (JSONArray) jjval);
+ } else if (jjval instanceof JSONObject) {
+ renameKeysRecursively(delimiter, (JSONObject) jjval);
+ } // All other types of vals
+ else
+ ;
+ }
+ }
+
+ private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception {
+ Set<String> jkeys = new HashSet<String>(givenJsonObj.keySet());
+ for (String jkey : jkeys) {
+
+ String[] splitArray = jkey.split(delimiter);
+ String newJkey = splitArray[splitArray.length - 1];
+
+ Object jval = givenJsonObj.get(jkey);
+ givenJsonObj.remove(jkey);
+ givenJsonObj.put(newJkey, jval);
+
+ if (jval instanceof JSONObject) {
+ renameKeysRecursively(delimiter, (JSONObject) jval);
+ }
+
+ if (jval instanceof JSONArray) {
+ renameKeysRecursively(delimiter, (JSONArray) jval);
+ }
+ }
+ }
+
+ private String getJson(String url) throws Exception {
+ // String cred=username+":"+password;
+ // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes());
+ try {
+ URL website = new URL(url);
+ URLConnection connection = website.openConnection();
+ // connection.setRequestProperty ("Authorization", "Basic "+encoded);
+ StringBuilder response;
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
+ response = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ response.append(inputLine);
+ response.append("\n");
+ }
+ }
+ return response.toString();
+ } catch (Exception e) {
+
+ // Logging error and silently continuing
+ logger.error("Failed to get URL: " + e);
+ System.out.println("Failed to get URL: " + e);
+// return null;
+// throw new Exception("Failed to get URL: " + e.toString(), e);
+ }
+ return null;
+ }
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/UsageStatsExporter.java
new file mode 100644
index 000000000..bec38573d
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/java/eu/dnetlib/oa/graph/usagerawdatabeta/export/UsageStatsExporter.java
@@ -0,0 +1,219 @@
+
+package eu.dnetlib.oa.graph.usagerawdatabeta.export;
+
+import java.io.IOException;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Main class for downloading and processing Usage statistics
+ *
+ * @author D. Pierrakos, S. Zoupanos
+ */
+public class UsageStatsExporter {
+
+ public UsageStatsExporter() {
+
+ }
+
+ private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class);
+
+ private void reCreateLogDirs() throws IllegalArgumentException, IOException {
+ FileSystem dfs = FileSystem.get(new Configuration());
+
+ logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath);
+ dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true);
+
+ logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath);
+ dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true);
+
+ logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+ dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true);
+
+ logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath));
+
+ logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath));
+
+ logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath);
+ dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath));
+ }
+
+ public void export() throws Exception {
+
+ logger.info("Initialising DB properties");
+ ConnectDB.init();
+
+ PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
+
+ logger.info("Re-creating database and tables");
+ if (ExecuteWorkflow.recreateDbAndTables) {
+ piwikstatsdb.recreateDBAndTables();
+ logger.info("DB-Tables-TmpTables are created ");
+ }
+
+ logger.info("Initializing the download logs module");
+ PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken);
+
+ if (ExecuteWorkflow.piwikEmptyDirs) {
+ logger.info("Recreating Piwik log directories");
+ piwikstatsdb.reCreateLogDirs();
+ }
+
+ // Downloading piwik logs (also managing directory creation)
+ if (ExecuteWorkflow.downloadPiwikLogs) {
+ logger.info("Downloading piwik logs");
+ piwd
+ .GetOpenAIRELogs(
+ ExecuteWorkflow.repoLogPath,
+ ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID);
+ }
+ logger.info("Downloaded piwik logs");
+
+ // Create DB tables, insert/update statistics
+ String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json";
+ piwikstatsdb.setCounterRobotsURL(cRobotsUrl);
+
+ if (ExecuteWorkflow.processPiwikLogs) {
+ logger.info("Processing logs");
+ piwikstatsdb.processLogs();
+ }
+
+ logger.info("Creating LaReferencia tables");
+ LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL,
+ ExecuteWorkflow.lareferenciaAuthToken);
+
+ if (ExecuteWorkflow.laReferenciaEmptyDirs) {
+ logger.info("Recreating LaReferencia log directories");
+ lrf.reCreateLogDirs();
+ }
+
+ if (ExecuteWorkflow.downloadLaReferenciaLogs) {
+ logger.info("Downloading LaReferencia logs");
+ lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath);
+ logger.info("Downloaded LaReferencia logs");
+ }
+
+ LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath);
+
+ if (ExecuteWorkflow.processLaReferenciaLogs) {
+ logger.info("Processing LaReferencia logs");
+ lastats.processLogs();
+ logger.info("LaReferencia logs done");
+ }
+
+ IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL);
+ if (ExecuteWorkflow.irusCreateTablesEmptyDirs) {
+ logger.info("Creating Irus Stats tables");
+ irusstats.createTables();
+ logger.info("Created Irus Stats tables");
+
+ logger.info("Re-create log dirs");
+ irusstats.reCreateLogDirs();
+ logger.info("Re-created log dirs");
+ }
+
+ if (ExecuteWorkflow.irusDownloadReports) {
+ irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath);
+ }
+
+ if (ExecuteWorkflow.irusProcessStats) {
+ irusstats.processIrusStats();
+ logger.info("Irus done");
+ }
+
+ SarcStats sarcStats = new SarcStats();
+ if (ExecuteWorkflow.sarcCreateTablesEmptyDirs) {
+ sarcStats.reCreateLogDirs();
+ }
+ if (ExecuteWorkflow.sarcDownloadReports) {
+ sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
+ }
+
+ if (ExecuteWorkflow.sarcProcessStats) {
+ sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray);
+ sarcStats.updateSarcLogs();
+ }
+ logger.info("Sarc done");
+
+ PiwikDownloadLogs_B2SHARE b2sharePiwikID = new PiwikDownloadLogs_B2SHARE(ExecuteWorkflow.matomoBaseURL,
+ ExecuteWorkflow.matomoAuthToken);
+ b2sharePiwikID.GetOpenAIREB2SHARELogs(ExecuteWorkflow.repoLogPath);
+ logger.info("B2SHARE done");
+
+ PiwikStatsDB_B2SHARE piwikstatsB2SHAREdb = new PiwikStatsDB_B2SHARE(ExecuteWorkflow.repoLogPath,
+ ExecuteWorkflow.portalLogPath);
+ piwikstatsB2SHAREdb.setCounterRobotsURL(cRobotsUrl);
+
+ logger.info("Processing B2SHARE logs");
+ piwikstatsB2SHAREdb.processB2SHARELogs();
+
+ // finalize usagestats
+ logger.info("Dropping tmp tables");
+ if (ExecuteWorkflow.finalizeStats) {
+ piwikstatsdb.finalizeStats();
+ logger.info("Dropped tmp tables");
+ }
+
+ logger.info("Raw Data Download End");
+ }
+
+ public void createdDBWithTablesOnly() throws Exception {
+ logger.info("Initialising DB properties");
+ ConnectDB.init();
+
+ PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath);
+ piwikstatsdb.recreateDBAndTables();
+
+ piwikstatsdb.createPedocsOldUsageData();
+
+ Statement stmt = ConnectDB.getHiveConnection().createStatement();
+
+ logger.info("Creating LaReferencia tables");
+ String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS "
+ + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, "
+ + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, "
+ + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) "
+ + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets "
+ + "stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableLareferenciaLog);
+ logger.info("Created LaReferencia tables");
+
+ logger.info("Creating sushilog");
+
+ String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog(source STRING, "
+ + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, "
+ + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')";
+ stmt.executeUpdate(sqlCreateTableSushiLog);
+ logger.info("Created sushilog");
+
+ logger.info("Updating piwiklog");
+ String sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ + ".piwiklog select * from openaire_prod_usage_raw.piwiklog";
+ stmt.executeUpdate(sql);
+
+ logger.info("Updating lareferencialog");
+ sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ + ".lareferencialog select * from openaire_prod_usage_raw.lareferencialog";
+ stmt.executeUpdate(sql);
+
+ logger.info("Updating sushilog");
+ sql = "insert into " + ConnectDB.getUsageStatsDBSchema()
+ + ".sushilog select * from openaire_prod_usage_raw.sushilog";
+ stmt.executeUpdate(sql);
+
+ stmt.close();
+ ConnectDB.getHiveConnection().close();
+ logger.info("Sushi Tables Created");
+
+ }
+
+}
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/export/usagerawdata_parameters.json
new file mode 100644
index 000000000..8c733c55b
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/export/usagerawdata_parameters.json
@@ -0,0 +1,225 @@
+[
+ {
+ "paramName": "mat",
+ "paramLongName": "matomoAuthToken",
+ "paramDescription": "the authentication token of the Matomo API",
+ "paramRequired": false
+ },
+ {
+ "paramName": "mbu",
+ "paramLongName": "matomoBaseURL",
+ "paramDescription": "the base URL of the Matomo service",
+ "paramRequired": true
+ },
+ {
+ "paramName": "rlp",
+ "paramLongName": "repoLogPath",
+ "paramDescription": "path where the repository logs are stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "plp",
+ "paramLongName": "portalLogPath",
+ "paramDescription": "path where the portal logs are stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pmi",
+ "paramLongName": "portalMatomoID",
+ "paramDescription": "the Matomo site ID of the OpenAIRE portal",
+ "paramRequired": true
+ },
+ {
+ "paramName": "iukbuw",
+ "paramLongName": "irusUKBaseURL",
+ "paramDescription": "the base URL of the IRUS-UK SUSHI service",
+ "paramRequired": true
+ },
+ {
+ "paramName": "iukrp",
+ "paramLongName": "irusUKReportPath",
+ "paramDescription": "path where the IRUS-UK reports are stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "srpa",
+ "paramLongName": "sarcsReportPathArray",
+ "paramDescription": "path where the SARC reports with ItemIdentifier as an array are stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "srpna",
+ "paramLongName": "sarcsReportPathNonArray",
+ "paramDescription": "path where the SARC reports with ItemIdentifier as a single object are stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "llp",
+ "paramLongName": "lareferenciaLogPath",
+ "paramDescription": "path where the LaReferencia logs are stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "lbu",
+ "paramLongName": "lareferenciaBaseURL",
+ "paramDescription": "the base URL of the LaReferencia Matomo service",
+ "paramRequired": true
+ },
+ {
+ "paramName": "lat",
+ "paramLongName": "lareferenciaAuthToken",
+ "paramDescription": "the authentication token of the LaReferencia Matomo API",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dbhu",
+ "paramLongName": "dbHiveUrl",
+ "paramDescription": "the JDBC URL of the Hive server",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dbiu",
+ "paramLongName": "dbImpalaUrl",
+ "paramDescription": "the JDBC URL of the Impala server",
+ "paramRequired": true
+ },
+ {
+ "paramName": "usdbs",
+ "paramLongName": "usageStatsDBSchema",
+ "paramDescription": "the name of the usage stats database schema",
+ "paramRequired": true
+ },
+ {
+ "paramName": "sdbs",
+ "paramLongName": "statsDBSchema",
+ "paramDescription": "the name of the stats database schema",
+ "paramRequired": true
+ },
+ {
+ "paramName": "rdbt",
+ "paramLongName": "recreateDbAndTables",
+ "paramDescription": "Re-create database and initial tables?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pwed",
+ "paramLongName": "piwikEmptyDirs",
+ "paramDescription": "Empty piwik directories?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ppwl",
+ "paramLongName": "processPiwikLogs",
+ "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dpwl",
+ "paramLongName": "downloadPiwikLogs",
+ "paramDescription": "download piwik logs?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "slp",
+ "paramLongName": "startingLogPeriod",
+ "paramDescription": "Starting log period",
+ "paramRequired": true
+ },
+ {
+ "paramName": "npidd",
+ "paramLongName": "numberOfPiwikIdsToDownload",
+ "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nsidd",
+ "paramLongName": "numberOfSiteIdsToDownload",
+ "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload",
+ "paramRequired": true
+ },
+ {
+ "paramName": "lerd",
+ "paramLongName": "laReferenciaEmptyDirs",
+ "paramDescription": "Empty LaReferencia directories?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "plrl",
+ "paramLongName": "processLaReferenciaLogs",
+ "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dlrl",
+ "paramLongName": "downloadLaReferenciaLogs",
+ "paramDescription": "download La Referencia logs?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "icted",
+ "paramLongName": "irusCreateTablesEmptyDirs",
+ "paramDescription": "Irus section: Create tables and empty JSON directories?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "idr",
+ "paramLongName": "irusDownloadReports",
+ "paramDescription": "Irus section: Download reports?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ipr",
+ "paramLongName": "irusProcessStats",
+ "paramDescription": "Irus section: Process stats?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "inod",
+ "paramLongName": "irusNumberOfOpendoarsToDownload",
+ "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload",
+ "paramRequired": true
+ },
+ {
+ "paramName": "scted",
+ "paramLongName": "sarcCreateTablesEmptyDirs",
+ "paramDescription": "Sarc section: Create tables and empty JSON directories?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "sdr",
+ "paramLongName": "sarcDownloadReports",
+ "paramDescription": "Sarc section: Download reports?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "spr",
+ "paramLongName": "sarcProcessStats",
+ "paramDescription": "Sarc section: Process stats?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "snod",
+ "paramLongName": "sarcNumberOfIssnToDownload",
+ "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload",
+ "paramRequired": true
+ },
+
+ {
+ "paramName": "fs",
+ "paramLongName": "finalizeStats",
+ "paramDescription": "Create the usage_stats table?",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nodt",
+ "paramLongName": "numberOfDownloadThreads",
+ "paramDescription": "Number of download threads",
+ "paramRequired": true
+ },
+ {
+ "paramName": "b2shareID",
+ "paramLongName": "b2shareID",
+ "paramDescription": "B2SHARE Matomo ID",
+ "paramRequired": true
+ }
+]
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/oozie_app/config-default.xml
new file mode 100644
index 000000000..b5c807378
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/oozie_app/config-default.xml
@@ -0,0 +1,38 @@
+
+
+ jobTracker
+ ${jobTracker}
+
+
+ nameNode
+ ${nameNode}
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hiveMetastoreUris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ hiveJdbcUrl
+ jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1
+
+
+ impalaJdbcUrl
+ jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl;
+
+
+ oozie.wf.workflow.notification.url
+ {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status
+
+
+ oozie.use.system.libpath
+ true
+
+
diff --git a/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/oozie_app/workflow.xml
new file mode 100644
index 000000000..44b6b9dde
--- /dev/null
+++ b/dhp-workflows/dhp-usage-raw-data-update-beta/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdatabeta/oozie_app/workflow.xml
@@ -0,0 +1,89 @@
+
+
+
+ hiveMetastoreUris
+ Hive server metastore URIs
+
+
+ hiveJdbcUrl
+ Hive server jdbc url
+
+
+ impalaJdbcUrl
+ Impala server jdbc url
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+
+
+ hive.metastore.uris
+ ${hiveMetastoreUris}
+
+
+ mapreduce.job.queuename
+ ${queueName}
+
+
+ oozie.launcher.mapred.job.queue.name
+ ${oozieLauncherQueueName}
+
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+ eu.dnetlib.oa.graph.usagerawdatabeta.export.ExecuteWorkflow
+ --matomoAuthToken${matomoAuthToken}
+ --matomoBaseURL${matomoBaseURL}
+ --repoLogPath${repoLogPath}
+ --portalLogPath${portalLogPath}
+ --portalMatomoID${portalMatomoID}
+ --irusUKBaseURL${irusUKBaseURL}
+ --irusUKReportPath${irusUKReportPath}
+ --sarcsReportPathArray${sarcsReportPathArray}
+ --sarcsReportPathNonArray${sarcsReportPathNonArray}
+ --lareferenciaLogPath${lareferenciaLogPath}
+ --lareferenciaBaseURL${lareferenciaBaseURL}
+ --lareferenciaAuthToken${lareferenciaAuthToken}
+ --dbHiveUrl${hiveJdbcUrl}
+ --dbImpalaUrl${impalaJdbcUrl}
+ --usageStatsDBSchema${usageStatsDBSchema}
+ --statsDBSchema${statsDBSchema}
+ --recreateDbAndTables${recreateDbAndTables}
+ --piwikEmptyDirs${piwikEmptyDirs}
+ --downloadPiwikLogs${downloadPiwikLogs}
+ --processPiwikLogs${processPiwikLogs}
+ --startingLogPeriod${startingLogPeriod}
+ --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload}
+ --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload}
+ --laReferenciaEmptyDirs${laReferenciaEmptyDirs}
+ --downloadLaReferenciaLogs${downloadLaReferenciaLogs}
+ --processLaReferenciaLogs${processLaReferenciaLogs}
+ --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs}
+ --irusDownloadReports${irusDownloadReports}
+ --irusProcessStats${irusProcessStats}
+ --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload}
+ --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs}
+ --sarcDownloadReports${sarcDownloadReports}
+ --sarcProcessStats${sarcProcessStats}
+ --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload}
+ --finalizeStats${finalizeStats}
+ --numberOfDownloadThreads${numberOfDownloadThreads}
+ --b2shareID${b2shareID}
+
+
+
+
+
+
+
+
diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java
index 00378ca1f..f5d50516e 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java
@@ -686,6 +686,12 @@ public class PiwikStatsDB {
+ "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
stmt.executeUpdate(sql);
+ logger.info("Inserting ALcala data to piwiklog");
+ sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog "
+ + "SELECT * FROM openaire_beta_usage_raw.piwiklog";
+ stmt.executeUpdate(sql);
+ logger.info("Inserted ALcala data to piwiklog");
+
logger.info("Dropping piwiklogtmp");
sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp";
stmt.executeUpdate(sql);