Compare commits

..

No commits in common. "main" and "openorgswf" have entirely different histories.

2248 changed files with 67178 additions and 301903 deletions

6
.gitignore vendored
View File

@ -3,6 +3,8 @@
*.iws
*.ipr
*.iml
*.ipr
*.iws
*~
.vscode
.metals
@ -25,6 +27,4 @@ spark-warehouse
/**/job-override.properties
/**/*.log
/**/.factorypath
/**/.scalafmt.conf
/.java-version
/dhp-shade-package/dependency-reduced-pom.xml

View File

@ -1,21 +0,0 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

757
100.patch Normal file
View File

@ -0,0 +1,757 @@
From c5fbad8093ca27deebf1b5fd5ffd39e1877c533d Mon Sep 17 00:00:00 2001
From: antleb <antleb@di.uoa.gr>
Date: Thu, 4 Mar 2021 00:42:21 +0200
Subject: [PATCH 1/8] Contexts are now downloaded instead of using the
stats_ext db
---
.../dhp/oa/graph/stats/oozie_app/contexts.sh | 33 +++++++++++++++++++
.../graph/stats/oozie_app/scripts/step10.sql | 13 --------
.../dhp/oa/graph/stats/oozie_app/workflow.xml | 17 ++++++++++
3 files changed, 50 insertions(+), 13 deletions(-)
create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
new file mode 100644
index 00000000..f06a43bb
--- /dev/null
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+CONTEXT_API=$1
+TARGET_DB=$2
+
+TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
+
+echo "Downloading context data"
+curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
+cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
+
+echo "uploading context data to hdfs"
+hdfs dfs -mkdir ${TMP}
+hdfs dfs -copyFromLocal contexts.csv ${TMP}
+hdfs dfs -copyFromLocal categories.csv ${TMP}
+hdfs dfs -copyFromLocal concepts.csv ${TMP}
+hdfs dfs -chmod -R 777 ${TMP}
+
+echo "Creating and populating impala tables"
+impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
+impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
+impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
+impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
+impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
+impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
+
+echo "Cleaning up"
+hdfs dfs -rm -f -r -skipTrash ${TMP}
+
+echo "Finito!"
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
index 6c96317e..77fbd3b1 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
@@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
SELECT *
FROM ${external_stats_db_name}.rndexpediture;
-CREATE OR REPLACE VIEW ${stats_db_name}.context AS
-SELECT *
-FROM ${external_stats_db_name}.context;
-
-CREATE OR REPLACE VIEW ${stats_db_name}.category AS
-SELECT *
-FROM ${external_stats_db_name}.category;
-
-CREATE OR REPLACE VIEW ${stats_db_name}.concept AS
-SELECT *
-FROM ${external_stats_db_name}.concept;
-
-
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Creation date of the database
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
index 9c16f149..afb10c41 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@@ -41,6 +41,10 @@
<name>hive_timeout</name>
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
</property>
+ <property>
+ <name>context_api_url</name>
+ <description>the base url of the context api (https://services.openaire.eu/openaire)</description>
+ </property>
</parameters>
<global>
@@ -263,6 +267,19 @@
<ok to="Step19"/>
<error to="Kill"/>
</action>
+
+ <action name="Step17">
+ <shell xmlns="uri:oozie:shell-action:0.1">
+ <job-tracker>${jobTracker}</job-tracker>
+ <name-node>${nameNode}</name-node>
+ <exec>contexts.sh</exec>
+ <argument>${context_api_url}</argument>
+ <argument>${stats_db_name}</argument>
+ <file>contexts.sh</file>
+ </shell>
+ <ok to="step20-createMonitorDB"/>
+ <error to="Kill"/>
+ </action>
<action name="Step19">
<shell xmlns="uri:oozie:shell-action:0.1">
--
2.17.1
From 6147ee495053634436abe822aaf9ba909813d8c4 Mon Sep 17 00:00:00 2001
From: antleb <antleb@di.uoa.gr>
Date: Fri, 5 Mar 2021 14:12:18 +0200
Subject: [PATCH 2/8] assigning correctly hive contexts to concepts
---
.../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 7 +++++--
.../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql | 5 ++++-
.../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql | 5 ++++-
.../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql | 5 ++++-
.../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql | 5 ++++-
5 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
index f06a43bb..6788f88b 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
@@ -9,8 +9,8 @@ echo "Downloading context data"
curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
-cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
-cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
+cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
+cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
echo "uploading context data to hdfs"
hdfs dfs -mkdir ${TMP}
@@ -29,5 +29,8 @@ impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.
echo "Cleaning up"
hdfs dfs -rm -f -r -skipTrash ${TMP}
+rm concepts.csv
+rm categories.csv
+rm contexts.csv
echo "Finito!"
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
index 62a15856..75b24b18 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
@@ -47,7 +47,10 @@ from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_concepts AS
-SELECT substr(p.id, 4) as id, contexts.context.id as concept
+SELECT substr(p.id, 4) as id, case
+ when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+ when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+ when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false;
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
index dcd5ad85..540cc03a 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.dataset_concepts AS
-SELECT substr(p.id, 4) as id, contexts.context.id as concept
+SELECT substr(p.id, 4) as id, case
+ when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+ when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+ when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false;
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
index fd5390e6..54345e07 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_concepts AS
-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
+SELECT substr(p.id, 4) as id, case
+ when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+ when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+ when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false;
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
index b359b596..36ad5d92 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
@@ -52,7 +52,10 @@ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.
where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
+SELECT substr(p.id, 4) as id, case
+ when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+ when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+ when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false;
--
2.17.1
From f40c150a0d549e2dbcfd42ecf81e17ad4b505391 Mon Sep 17 00:00:00 2001
From: antleb <antleb@di.uoa.gr>
Date: Sat, 6 Mar 2021 00:35:57 +0200
Subject: [PATCH 3/8] fixed steps...
---
.../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
index afb10c41..2184cb8a 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@@ -264,7 +264,7 @@
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
- <ok to="Step19"/>
+ <ok to="Step17"/>
<error to="Kill"/>
</action>
@@ -277,7 +277,7 @@
<argument>${stats_db_name}</argument>
<file>contexts.sh</file>
</shell>
- <ok to="step20-createMonitorDB"/>
+ <ok to="step19"/>
<error to="Kill"/>
</action>
--
2.17.1
From fa1ec5b5e9b6038b3b565422af5c6406f21220d3 Mon Sep 17 00:00:00 2001
From: antleb <antleb@di.uoa.gr>
Date: Wed, 10 Mar 2021 14:05:58 +0200
Subject: [PATCH 4/8] fixed typo...
---
.../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
index 2184cb8a..321500e2 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@@ -277,7 +277,7 @@
<argument>${stats_db_name}</argument>
<file>contexts.sh</file>
</shell>
- <ok to="step19"/>
+ <ok to="Step19"/>
<error to="Kill"/>
</action>
--
2.17.1
From 3c75a050443942b632cf8469b5af16a8c61e7569 Mon Sep 17 00:00:00 2001
From: antleb <antleb@di.uoa.gr>
Date: Fri, 12 Mar 2021 13:47:04 +0200
Subject: [PATCH 5/8] fixed a ton of typos
---
.../scripts/computeProductionStats.sql | 8 -------
.../stats/oozie_app/updateProductionViews.sh | 18 ++++++++++++++++
.../dhp/oa/graph/stats/oozie_app/contexts.sh | 21 ++++++++++++-------
3 files changed, 32 insertions(+), 15 deletions(-)
delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
create mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
deleted file mode 100644
index 34e48a18..00000000
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
+++ /dev/null
@@ -1,8 +0,0 @@
-------------------------------------------------------
-------------------------------------------------------
--- Impala table statistics - Needed to make the tables
--- visible for impala
-------------------------------------------------------
-------------------------------------------------------
-
-INVALIDATE METADATA ${stats_db_name};
diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
new file mode 100644
index 00000000..57acb2ee
--- /dev/null
+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
@@ -0,0 +1,18 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+ rm -Rf "$link_folder"
+ ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+export SOURCE=$1
+export SHADOW=$2
+
+echo "Updating shadow database"
+impala-shell -d ${SOURCE} -q "invalidate metadata"
+impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
+impala-shell -q "create database if not exists ${SHADOW}"
+impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
+impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
+echo "Shadow db ready!"
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
index 6788f88b..c28be50d 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
@@ -1,4 +1,10 @@
-#!/usr/bin/env bash
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+ rm -Rf "$link_folder"
+ ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
CONTEXT_API=$1
TARGET_DB=$2
@@ -20,12 +26,13 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP}
hdfs dfs -chmod -R 777 ${TMP}
echo "Creating and populating impala tables"
-impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
-impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
-impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
-impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
-impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
-impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
+impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
+impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
+impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
+impala-shell -d ${TARGET_DB} -q "invalidate metadata"
+impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
+impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
+impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
echo "Cleaning up"
hdfs dfs -rm -f -r -skipTrash ${TMP}
--
2.17.1
From 236435b47010ea1ab94c3f018dcf278f5d2c44aa Mon Sep 17 00:00:00 2001
From: antleb <antleb@di.uoa.gr>
Date: Fri, 12 Mar 2021 14:11:21 +0200
Subject: [PATCH 6/8] following redirects
---
.../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
index c28be50d..29b225e3 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
@@ -12,9 +12,9 @@ TARGET_DB=$2
TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
echo "Downloading context data"
-curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
-cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
-cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+curl -L ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
--
2.17.1
From 60ebdf2dbe704733809f401df70bffcf49cede29 Mon Sep 17 00:00:00 2001
From: antleb <antleb@di.uoa.gr>
Date: Fri, 12 Mar 2021 16:34:53 +0200
Subject: [PATCH 7/8] update promote wf to support monitor&production
---
.../oa/graph/stats/oozie_app/impala-shell.sh | 18 --
.../scripts/updateProductionViews.sql | 207 ------------------
2 files changed, 225 deletions(-)
delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
deleted file mode 100644
index 70112dc7..00000000
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
-export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
-if ! [ -L $link_folder ]
-then
- rm -Rf "$link_folder"
- ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
-fi
-
-echo "Getting file from " $3
-hdfs dfs -copyToLocal $3
-
-echo "Running impala shell make the new database visible"
-impala-shell -q "INVALIDATE METADATA;"
-
-echo "Running impala shell to compute new table stats"
-impala-shell -d $1 -f $2
-echo "Impala shell finished"
-rm $2
diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
deleted file mode 100644
index 48f8d58f..00000000
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
+++ /dev/null
@@ -1,207 +0,0 @@
-------------------------------------------------------
-------------------------------------------------------
--- Shadow schema table exchange
-------------------------------------------------------
-------------------------------------------------------
-
--- Dropping old views
-DROP VIEW IF EXISTS ${stats_db_production_name}.category;
-DROP VIEW IF EXISTS ${stats_db_production_name}.concept;
-DROP VIEW IF EXISTS ${stats_db_production_name}.context;
-DROP VIEW IF EXISTS ${stats_db_production_name}.country;
-DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp;
-DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_results;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.funder;
-DROP VIEW IF EXISTS ${stats_db_production_name}.fundref;
-DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics;
-DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture;
-DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics;
-
-
--- Creating the shadow database, in case it doesn't exist
-CREATE database IF NOT EXISTS ${stats_db_production_name};
-
--- Creating new views
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics;
--
2.17.1
From 0ba0a6b9dac25f5ec73e8eafefbf7f91442ad1c5 Mon Sep 17 00:00:00 2001
From: antleb <antleb@di.uoa.gr>
Date: Fri, 12 Mar 2021 16:42:59 +0200
Subject: [PATCH 8/8] update promote wf to support monitor&production
---
.../stats/oozie_app/updateProductionViews.sh | 14 +++----
.../dhp/oa/graph/stats/oozie_app/workflow.xml | 37 ++++++++++++-------
2 files changed, 29 insertions(+), 22 deletions(-)
diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
index 57acb2ee..3e510e87 100644
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
@@ -7,12 +7,10 @@ then
fi
export SOURCE=$1
-export SHADOW=$2
+export PRODUCTION=$2
-echo "Updating shadow database"
-impala-shell -d ${SOURCE} -q "invalidate metadata"
-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
-impala-shell -q "create database if not exists ${SHADOW}"
-impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
-echo "Shadow db ready!"
\ No newline at end of file
+echo "Updating ${PRODUCTION} database"
+impala-shell -q "create database if not exists ${PRODUCTION}"
+impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
+impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
+echo "Production db ready!"
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
index d744f18d..0d8ff7ee 100644
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@@ -6,7 +6,15 @@
</property>
<property>
<name>stats_db_production_name</name>
- <description>the name of the production schema</description>
+ <description>the name of the public production schema</description>
+ </property>
+ <property>
+ <name>monitor_db_name</name>
+ <description>the monitor database name</description>
+ </property>
+ <property>
+ <name>monitor_db_production_name</name>
+ <description>the name of the monitor public database</description>
</property>
<property>
<name>stats_tool_api_url</name>
@@ -48,25 +56,26 @@
</kill>
<action name="updateProductionViews">
- <hive2 xmlns="uri:oozie:hive2-action:0.1">
- <jdbc-url>${hive_jdbc_url}</jdbc-url>
- <script>scripts/updateProductionViews.sql</script>
- <param>stats_db_name=${stats_db_name}</param>
- <param>stats_db_production_name=${stats_db_production_name}</param>
- </hive2>
- <ok to="computeProductionStats"/>
+ <shell xmlns="uri:oozie:shell-action:0.1">
+ <job-tracker>${jobTracker}</job-tracker>
+ <name-node>${nameNode}</name-node>
+ <exec>updateProductionViews.sh</exec>
+ <argument>${stats_db_name}</argument>
+ <argument>${stats_db_production_name}</argument>
+ <file>updateProductionViews.sh</file>
+ </shell>
+ <ok to="updateMonitorViews"/>
<error to="Kill"/>
</action>
- <action name="computeProductionStats">
+ <action name="updateMonitorViews">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
- <exec>impala-shell.sh</exec>
- <argument>${stats_db_production_name}</argument>
- <argument>computeProductionStats.sql</argument>
- <argument>${wf:appPath()}/scripts/computeProductionStats.sql</argument>
- <file>impala-shell.sh</file>
+ <exec>updateProductionViews.sh</exec>
+ <argument>${monitor_db_name}</argument>
+ <argument>${monitor_db_production_name}</argument>
+ <file>updateProductionViews.sh</file>
</shell>
<ok to="promoteCache"/>
<error to="Kill"/>
--
2.17.1

View File

@ -1,43 +0,0 @@
# Contributor Code of Conduct
Openness, transparency and our community-driven participatory approach guide us in our day-to-day interactions and decision-making. Our open source projects are no exception. Trust, respect, collaboration and transparency are core values we believe should live and breathe within our projects. Our community welcomes participants from around the world with different experiences, unique perspectives, and great ideas to share.
## Our Pledge
In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment include:
- Using welcoming and inclusive language
- Being respectful of differing viewpoints and experiences
- Gracefully accepting constructive criticism
- Attempting collaboration before conflict
- Focusing on what is best for the community
- Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
- Violence, threats of violence, or inciting others to commit self-harm
- The use of sexualized language or imagery and unwelcome sexual attention or advances
- Trolling, intentionally spreading misinformation, insulting/derogatory comments, and personal or political attacks
- Public or private harassment
- Publishing others' private information, such as a physical or electronic address, without explicit permission
- Abuse of the reporting process to intentionally harass or exclude others
- Advocating for, or encouraging, any of the above behavior
- Other conduct which could reasonably be considered inappropriate in a professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), [version 1.4](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html).

View File

@ -1,10 +0,0 @@
# Contributing to D-Net Hadoop
:+1::tada: First off, thanks for taking the time to contribute! :tada::+1:
This project and everyone participating in it is governed by our [Code of Conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. Please report unacceptable behavior to [dnet-team@isti.cnr.it](mailto:dnet-team@isti.cnr.it).
The following is a set of guidelines for contributing to this project and its packages. These are mostly guidelines, not rules, which applies to this project as a while, including all its sub-modules.
Use your best judgment, and feel free to propose changes to this document in a pull request.
All contributions are welcome, all contributions will be considered to be contributed under the [project license](LICENSE.md).

View File

133
README.md
View File

@ -1,133 +1,2 @@
# dnet-hadoop
Dnet-hadoop is the project that defined all the [OOZIE workflows](https://oozie.apache.org/) for the OpenAIRE Graph construction, processing, provisioning.
This project adheres to the Contributor Covenant [code of conduct](CODE_OF_CONDUCT.md).
By participating, you are expected to uphold this code. Please report unacceptable behavior to [dnet-team@isti.cnr.it](mailto:dnet-team@isti.cnr.it).
This project is licensed under the [AGPL v3 or later version](#LICENSE.md).
How to build, package and run oozie workflows
====================
Oozie-installer is a utility allowing building, uploading and running oozie workflows. In practice, it creates a `*.tar.gz`
package that contains resources that define a workflow and some helper scripts.
This module is automatically executed when running:
`mvn package -Poozie-package -Dworkflow.source.dir=classpath/to/parent/directory/of/oozie_app`
on module having set:
```
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
</parent>
```
in `pom.xml` file. `oozie-package` profile initializes oozie workflow packaging, `workflow.source.dir` property points to
a workflow (notice: this is not a relative path but a classpath to directory usually holding `oozie_app` subdirectory).
The outcome of this packaging is `oozie-package.tar.gz` file containing inside all the resources required to run Oozie workflow:
- jar packages
- workflow definitions
- job properties
- maintenance scripts
Required properties
====================
In order to include proper workflow within package, `workflow.source.dir` property has to be set. It could be provided
by setting `-Dworkflow.source.dir=some/job/dir` maven parameter.
In oder to define full set of cluster environment properties one should create `~/.dhp/application.properties` file with
the following properties:
- `dhp.hadoop.frontend.user.name` - your user name on hadoop cluster and frontend machine
- `dhp.hadoop.frontend.host.name` - frontend host name
- `dhp.hadoop.frontend.temp.dir` - frontend directory for temporary files
- `dhp.hadoop.frontend.port.ssh` - frontend machine ssh port
- `oozieServiceLoc` - oozie service location required by run_workflow.sh script executing oozie job
- `nameNode` - name node address
- `jobTracker` - job tracker address
- `oozie.execution.log.file.location` - location of file that will be created when executing oozie job, it contains output
produced by `run_workflow.sh` script (needed to obtain oozie job id)
- `maven.executable` - mvn command location, requires parameterization due to a different setup of CI cluster
- `sparkDriverMemory` - amount of memory assigned to spark jobs driver
- `sparkExecutorMemory` - amount of memory assigned to spark jobs executors
- `sparkExecutorCores` - number of cores assigned to spark jobs executors
All values will be overriden with the ones from `job.properties` and eventually `job-override.properties` stored in module's
main folder.
When overriding properties from `job.properties`, `job-override.properties` file can be created in main module directory
(the one containing `pom.xml` file) and define all new properties which will override existing properties.
One can provide those properties one by one as command line `-D` arguments.
Properties overriding order is the following:
1. `pom.xml` defined properties (located in the project root dir)
2. `~/.dhp/application.properties` defined properties
3. `${workflow.source.dir}/job.properties`
4. `job-override.properties` (located in the project root dir)
5. `maven -Dparam=value`
where the maven `-Dparam` property is overriding all the other ones.
Workflow definition requirements
====================
`workflow.source.dir` property should point to the following directory structure:
[${workflow.source.dir}]
|
|-job.properties (optional)
|
\-[oozie_app]
|
\-workflow.xml
This property can be set using maven `-D` switch.
`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is
provided with directory name as value.
Sub-workflows are supported as well and sub-workflow directories should be nested within `[oozie_app]` directory.
Creating oozie installer step-by-step
=====================================
Automated oozie-installer steps are the following:
1. creating jar packages: `*.jar` and `*tests.jar` along with copying all dependencies in `target/dependencies`
2. reading properties from maven, `~/.dhp/application.properties`, `job.properties`, `job-override.properties`
3. invoking priming mechanism linking resources from import.txt file (currently resolving subworkflow resources)
4. assembling shell scripts for preparing Hadoop filesystem, uploading Oozie application and starting workflow
5. copying whole `${workflow.source.dir}` content to `target/${oozie.package.file.name}`
6. generating updated `job.properties` file in `target/${oozie.package.file.name}` based on maven,
`~/.dhp/application.properties`, `job.properties` and `job-override.properties`
7. creating `lib` directory (or multiple directories for sub-workflows for each nested directory) and copying jar packages
created at step (1) to each one of them
8. bundling whole `${oozie.package.file.name}` directory into single tar.gz package
Uploading oozie package and running workflow on cluster
=======================================================
In order to simplify deployment and execution process two dedicated profiles were introduced:
- `deploy`
- `run`
to be used along with `oozie-package` profile e.g. by providing `-Poozie-package,deploy,run` maven parameters.
The `deploy` profile supplements packaging process with:
1) uploading oozie-package via scp to `/home/${user.name}/oozie-packages` directory on `${dhp.hadoop.frontend.host.name}` machine
2) extracting uploaded package
3) uploading oozie content to hadoop cluster HDFS location defined in `oozie.wf.application.path` property (generated dynamically by maven build process, based on `${dhp.hadoop.frontend.user.name}` and `workflow.source.dir` properties)
The `run` profile introduces:
1) executing oozie application uploaded to HDFS cluster using `deploy` command. Triggers `run_workflow.sh` script providing runtime properties defined in `job.properties` file.
Notice: ssh access to frontend machine has to be configured on system level and it is preferable to set key-based authentication in order to simplify remote operations.
Dnet-hadoop is a tool for

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.5-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.5-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>

View File

@ -8,6 +8,8 @@ import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.maven.plugin.AbstractMojo;
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.plugin.MojoFailureException;
/**
* Generates oozie properties which were not provided from commandline.
@ -25,7 +27,7 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo {
};
@Override
public void execute() {
public void execute() throws MojoExecutionException, MojoFailureException {
if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR)
&& !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) {
String generatedSandboxName = generateSandboxName(
@ -44,24 +46,24 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo {
/**
* Generates sandbox name from workflow source directory.
*
* @param wfSourceDir workflow source directory
* @param wfSourceDir
* @return generated sandbox name
*/
private String generateSandboxName(String wfSourceDir) {
// utilize all dir names until finding one of the limiters
List<String> sandboxNameParts = new ArrayList<>();
List<String> sandboxNameParts = new ArrayList<String>();
String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar);
ArrayUtils.reverse(tokens);
if (tokens.length > 0) {
for (String token : tokens) {
for (String limiter : limiters) {
if (limiter.equals(token)) {
return !sandboxNameParts.isEmpty()
return sandboxNameParts.size() > 0
? StringUtils.join(sandboxNameParts.toArray())
: null;
}
}
if (!sandboxNameParts.isEmpty()) {
if (sandboxNameParts.size() > 0) {
sandboxNameParts.add(0, File.separator);
}
sandboxNameParts.add(0, token);

View File

@ -16,7 +16,6 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -290,7 +289,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
*/
protected List<String> getEscapeChars(String escapeChars) {
List<String> tokens = getListFromCSV(escapeChars);
List<String> realTokens = new ArrayList<>();
List<String> realTokens = new ArrayList<String>();
for (String token : tokens) {
String realToken = getRealToken(token);
realTokens.add(realToken);
@ -325,7 +324,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
* @return content
*/
protected String getContent(String comment, Properties properties, List<String> escapeTokens) {
List<String> names = new ArrayList<>(properties.stringPropertyNames());
List<String> names = new ArrayList<String>(properties.stringPropertyNames());
Collections.sort(names);
StringBuilder sb = new StringBuilder();
if (!StringUtils.isBlank(comment)) {
@ -353,7 +352,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
throws MojoExecutionException {
try {
String content = getContent(comment, properties, escapeTokens);
FileUtils.writeStringToFile(file, content, StandardCharsets.UTF_8);
FileUtils.writeStringToFile(file, content, ENCODING_UTF8);
} catch (IOException e) {
throw new MojoExecutionException("Error creating properties file", e);
}
@ -400,9 +399,9 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
*/
protected static final List<String> getListFromCSV(String csv) {
if (StringUtils.isBlank(csv)) {
return new ArrayList<>();
return new ArrayList<String>();
}
List<String> list = new ArrayList<>();
List<String> list = new ArrayList<String>();
String[] tokens = StringUtils.split(csv, ",");
for (String token : tokens) {
list.add(token.trim());

View File

@ -9,18 +9,18 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
/** @author mhorst, claudio.atzori */
class GenerateOoziePropertiesMojoTest {
public class GenerateOoziePropertiesMojoTest {
private final GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
@BeforeEach
void clearSystemProperties() {
public void clearSystemProperties() {
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
}
@Test
void testExecuteEmpty() throws Exception {
public void testExecuteEmpty() throws Exception {
// execute
mojo.execute();
@ -29,7 +29,7 @@ class GenerateOoziePropertiesMojoTest {
}
@Test
void testExecuteSandboxNameAlreadySet() throws Exception {
public void testExecuteSandboxNameAlreadySet() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
String sandboxName = "originalSandboxName";
@ -44,7 +44,7 @@ class GenerateOoziePropertiesMojoTest {
}
@Test
void testExecuteEmptyWorkflowSourceDir() throws Exception {
public void testExecuteEmptyWorkflowSourceDir() throws Exception {
// given
String workflowSourceDir = "";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -57,7 +57,7 @@ class GenerateOoziePropertiesMojoTest {
}
@Test
void testExecuteNullSandboxNameGenerated() throws Exception {
public void testExecuteNullSandboxNameGenerated() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -70,7 +70,7 @@ class GenerateOoziePropertiesMojoTest {
}
@Test
void testExecute() throws Exception {
public void testExecute() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -83,7 +83,7 @@ class GenerateOoziePropertiesMojoTest {
}
@Test
void testExecuteWithoutRoot() throws Exception {
public void testExecuteWithoutRoot() throws Exception {
// given
String workflowSourceDir = "wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);

View File

@ -20,7 +20,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
/** @author mhorst, claudio.atzori */
@ExtendWith(MockitoExtension.class)
class WritePredefinedProjectPropertiesTest {
public class WritePredefinedProjectPropertiesTest {
@Mock
private MavenProject mavenProject;
@ -39,7 +39,7 @@ class WritePredefinedProjectPropertiesTest {
// ----------------------------------- TESTS ---------------------------------------------
@Test
void testExecuteEmpty() throws Exception {
public void testExecuteEmpty() throws Exception {
// execute
mojo.execute();
@ -50,7 +50,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteWithProjectProperties() throws Exception {
public void testExecuteWithProjectProperties() throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -70,7 +70,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test()
void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -80,19 +80,11 @@ class WritePredefinedProjectPropertiesTest {
mojo.outputFile = testFolder;
// execute
try {
mojo.execute();
Assertions.assertTrue(false); // not reached
} catch (Exception e) {
Assertions
.assertTrue(
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
}
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -116,7 +108,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -140,7 +132,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -172,7 +164,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder)
public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -202,7 +194,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteIncludingPropertyKeysFromBlankLocation() {
public void testExecuteIncludingPropertyKeysFromBlankLocation() {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -222,7 +214,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder)
public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -255,7 +247,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder)
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -281,7 +273,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
// given
mojo.setQuiet(true);
mojo.setIncludePropertyKeysFromFiles(new String[] {
@ -298,7 +290,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteIncludingPropertyKeysFromInvalidFile() {
public void testExecuteIncludingPropertyKeysFromInvalidFile() {
// given
mojo.setIncludePropertyKeysFromFiles(new String[] {
"invalid location"
@ -309,7 +301,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
// given
mojo.setIncludeEnvironmentVariables(true);
@ -326,7 +318,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
// given
String key = "systemPropertyKey";
String value = "systemPropertyValue";
@ -345,7 +337,7 @@ class WritePredefinedProjectPropertiesTest {
}
@Test
void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder)
public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder)
throws Exception {
// given
String key = "systemPropertyKey ";

View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>1.2.5-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
<packaging>jar</packaging>
@ -22,20 +22,9 @@
<id>dnet45-releases</id>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
</site>
</distributionManagement>
<build>
<extensions>
<extension>
<groupId>org.apache.maven.wagon</groupId>
<artifactId>wagon-ssh</artifactId>
<version>2.10</version>
</extension>
</extensions>
<pluginManagement>
<plugins>
<plugin>
@ -46,19 +35,14 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.9.1</version>
<configuration>
<skip>true</skip>
</configuration>
<version>3.7.1</version>
</plugin>
</plugins>
</pluginManagement>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>
</project>

View File

@ -1,21 +0,0 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

View File

@ -1,21 +0,0 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -4,15 +4,12 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.5-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>
<description>This module is a container for the build tools used in dnet-hadoop</description>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
</properties>
<modules>
<module>dhp-code-style</module>
@ -20,12 +17,4 @@
<module>dhp-build-properties-maven-plugin</module>
</modules>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/</url>
</site>
</distributionManagement>
</project>

View File

@ -1,22 +0,0 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.5-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
@ -13,80 +13,21 @@
<artifactId>dhp-common</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions>
<configuration>
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
</dependency>
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
</dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
@ -148,6 +89,11 @@
<artifactId>okhttp</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
@ -161,31 +107,8 @@
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<!-- dependencies required on JDK9+ because J2EE has been removed -->
<profiles>
<profile>
<id>spark-34</id>
<dependencies>
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
<version>2.2.11</version>
</dependency>
<dependency>
<groupId>com.sun.xml.ws</groupId>
<artifactId>jaxws-ri</artifactId>
<version>2.3.3</version>
<type>pom</type>
</dependency>
</dependencies>
</profile>
</profiles>
</project>

View File

@ -0,0 +1,182 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Date;
import java.util.Objects;
import java.util.UUID;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
@Entity
@Table(name = "mdstores")
public class MDStore implements Serializable {
/** */
private static final long serialVersionUID = 3160530489149700055L;
@Id
@Column(name = "id")
private String id;
@Column(name = "format")
private String format;
@Column(name = "layout")
private String layout;
@Column(name = "interpretation")
private String interpretation;
@Column(name = "datasource_name")
private String datasourceName;
@Column(name = "datasource_id")
private String datasourceId;
@Column(name = "api_id")
private String apiId;
@Column(name = "hdfs_path")
private String hdfsPath;
@Column(name = "creation_date")
@Temporal(TemporalType.TIMESTAMP)
private Date creationDate;
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getFormat() {
return format;
}
public void setFormat(final String format) {
this.format = format;
}
public String getLayout() {
return layout;
}
public void setLayout(final String layout) {
this.layout = layout;
}
public String getInterpretation() {
return interpretation;
}
public void setInterpretation(final String interpretation) {
this.interpretation = interpretation;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(final String datasourceName) {
this.datasourceName = datasourceName;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(final String datasourceId) {
this.datasourceId = datasourceId;
}
public String getApiId() {
return apiId;
}
public void setApiId(final String apiId) {
this.apiId = apiId;
}
public String getHdfsPath() {
return hdfsPath;
}
public void setHdfsPath(final String hdfsPath) {
this.hdfsPath = hdfsPath;
}
public Date getCreationDate() {
return creationDate;
}
public void setCreationDate(final Date creationDate) {
this.creationDate = creationDate;
}
public static MDStore newInstance(
final String format,
final String layout,
final String interpretation,
final String hdfsBasePath) {
return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath);
}
public static MDStore newInstance(
final String format,
final String layout,
final String interpretation,
final String dsName,
final String dsId,
final String apiId,
final String hdfsBasePath) {
final String mdId = "md-" + UUID.randomUUID();
final MDStore md = new MDStore();
md.setId(mdId);
md.setFormat(format);
md.setLayout(layout);
md.setInterpretation(interpretation);
md.setCreationDate(new Date());
md.setDatasourceName(dsName);
md.setDatasourceId(dsId);
md.setApiId(apiId);
md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId));
return md;
}
@Override
public String toString() {
return String
.format(
"MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]",
id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate);
}
@Override
public int hashCode() {
return Objects.hash(id);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof MDStore)) {
return false;
}
final MDStore other = (MDStore) obj;
return Objects.equals(id, other.id);
}
}

View File

@ -0,0 +1,74 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Objects;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
@Entity
@Table(name = "mdstore_current_versions")
public class MDStoreCurrentVersion implements Serializable {
/** */
private static final long serialVersionUID = -4757725888593745773L;
@Id
@Column(name = "mdstore")
private String mdstore;
@Column(name = "current_version")
private String currentVersion;
public String getMdstore() {
return mdstore;
}
public void setMdstore(final String mdstore) {
this.mdstore = mdstore;
}
public String getCurrentVersion() {
return currentVersion;
}
public void setCurrentVersion(final String currentVersion) {
this.currentVersion = currentVersion;
}
public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) {
final MDStoreCurrentVersion cv = new MDStoreCurrentVersion();
cv.setMdstore(mdId);
cv.setCurrentVersion(versionId);
return cv;
}
public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) {
return newInstance(v.getMdstore(), v.getId());
}
@Override
public String toString() {
return String.format("MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]", mdstore, currentVersion);
}
@Override
public int hashCode() {
return Objects.hash(currentVersion, mdstore);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof MDStoreCurrentVersion)) {
return false;
}
final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj;
return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore);
}
}

View File

@ -0,0 +1,140 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Date;
import java.util.Objects;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
@Entity
@Table(name = "mdstore_versions")
public class MDStoreVersion implements Serializable {
/** */
private static final long serialVersionUID = -4763494442274298339L;
@Id
@Column(name = "id")
private String id;
@Column(name = "mdstore")
private String mdstore;
@Column(name = "writing")
private boolean writing;
@Column(name = "readcount")
private int readCount = 0;
@Column(name = "lastupdate")
@Temporal(TemporalType.TIMESTAMP)
private Date lastUpdate;
@Column(name = "size")
private long size = 0;
@Column(name = "hdfs_path")
private String hdfsPath;
public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) {
final MDStoreVersion v = new MDStoreVersion();
final String versionId = mdId + "-" + new Date().getTime();
v.setId(versionId);
v.setMdstore(mdId);
v.setLastUpdate(null);
v.setWriting(writing);
v.setReadCount(0);
v.setSize(0);
v.setHdfsPath(String.format("%s/%s/%s", hdfsBasePath, mdId, versionId));
return v;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getMdstore() {
return mdstore;
}
public void setMdstore(final String mdstore) {
this.mdstore = mdstore;
}
public boolean isWriting() {
return writing;
}
public void setWriting(final boolean writing) {
this.writing = writing;
}
public int getReadCount() {
return readCount;
}
public void setReadCount(final int readCount) {
this.readCount = readCount;
}
public Date getLastUpdate() {
return lastUpdate;
}
public void setLastUpdate(final Date lastUpdate) {
this.lastUpdate = lastUpdate;
}
public long getSize() {
return size;
}
public void setSize(final long size) {
this.size = size;
}
public String getHdfsPath() {
return hdfsPath;
}
public void setHdfsPath(final String hdfsPath) {
this.hdfsPath = hdfsPath;
}
@Override
public String toString() {
return String
.format(
"MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id,
mdstore, writing, readCount, lastUpdate, size, hdfsPath);
}
@Override
public int hashCode() {
return Objects.hash(id);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof MDStoreVersion)) {
return false;
}
final MDStoreVersion other = (MDStoreVersion) obj;
return Objects.equals(id, other.id);
}
}

View File

@ -0,0 +1,194 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Date;
import java.util.Objects;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
@Entity
@Table(name = "mdstores_with_info")
public class MDStoreWithInfo implements Serializable {
/** */
private static final long serialVersionUID = -8445784770687571492L;
@Id
@Column(name = "id")
private String id;
@Column(name = "format")
private String format;
@Column(name = "layout")
private String layout;
@Column(name = "interpretation")
private String interpretation;
@Column(name = "datasource_name")
private String datasourceName;
@Column(name = "datasource_id")
private String datasourceId;
@Column(name = "api_id")
private String apiId;
@Column(name = "current_version")
private String currentVersion;
@Column(name = "creation_date")
@Temporal(TemporalType.TIMESTAMP)
private Date creationDate;
@Column(name = "lastupdate")
@Temporal(TemporalType.TIMESTAMP)
private Date lastUpdate;
@Column(name = "size")
private long size = 0;
@Column(name = "n_versions")
private long numberOfVersions = 0;
@Column(name = "hdfs_path")
private String hdfsPath;
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getFormat() {
return format;
}
public void setFormat(final String format) {
this.format = format;
}
public String getLayout() {
return layout;
}
public void setLayout(final String layout) {
this.layout = layout;
}
public String getInterpretation() {
return interpretation;
}
public void setInterpretation(final String interpretation) {
this.interpretation = interpretation;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(final String datasourceName) {
this.datasourceName = datasourceName;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(final String datasourceId) {
this.datasourceId = datasourceId;
}
public String getApiId() {
return apiId;
}
public void setApiId(final String apiId) {
this.apiId = apiId;
}
public String getCurrentVersion() {
return currentVersion;
}
public void setCurrentVersion(final String currentVersion) {
this.currentVersion = currentVersion;
}
public Date getCreationDate() {
return creationDate;
}
public void setCreationDate(final Date creationDate) {
this.creationDate = creationDate;
}
public Date getLastUpdate() {
return lastUpdate;
}
public void setLastUpdate(final Date lastUpdate) {
this.lastUpdate = lastUpdate;
}
public long getSize() {
return size;
}
public void setSize(final long size) {
this.size = size;
}
public long getNumberOfVersions() {
return numberOfVersions;
}
public void setNumberOfVersions(final long numberOfVersions) {
this.numberOfVersions = numberOfVersions;
}
public String getHdfsPath() {
return hdfsPath;
}
public void setHdfsPath(final String hdfsPath) {
this.hdfsPath = hdfsPath;
}
@Override
public String toString() {
return String
.format(
"MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]",
id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate,
lastUpdate, size, numberOfVersions, hdfsPath);
}
@Override
public int hashCode() {
return Objects.hash(id);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof MDStoreWithInfo)) {
return false;
}
final MDStoreWithInfo other = (MDStoreWithInfo) obj;
return Objects.equals(id, other.id);
}
}

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.application;
import java.io.*;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import com.google.common.collect.Maps;
public class ApplicationUtils {
}

View File

@ -56,13 +56,13 @@ public class ArgumentApplicationParser implements Serializable {
final StringWriter stringWriter = new StringWriter();
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (IOException e) {
log.error("Wrong value to decompress: {}", abstractCompressed);
throw new IllegalArgumentException(e);
} catch (Throwable e) {
log.error("Wrong value to decompress:" + abstractCompressed);
throw new RuntimeException(e);
}
}
public static String compressArgument(final String value) throws IOException {
public static String compressArgument(final String value) throws Exception {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes());

View File

@ -9,6 +9,9 @@ public class OptionsParameter {
private boolean paramRequired;
private boolean compressed;
public OptionsParameter() {
}
public String getParamName() {
return paramName;
}

View File

@ -34,7 +34,7 @@ public class ApiDescriptor {
return params;
}
public void setParams(final Map<String, String> params) {
public void setParams(final HashMap<String, String> params) {
this.params = params;
}

View File

@ -10,17 +10,8 @@ public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
public static final String ROR_NS_PREFIX = "ror_________";
public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222";
public static final String ROR_DATASOURCE_NAME = "Research Organization Registry (ROR)";
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
private Constants() {
}
static {
accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
@ -51,18 +42,9 @@ public class Constants {
public static final String RETRY_DELAY = "retryDelay";
public static final String CONNECT_TIMEOUT = "connectTimeOut";
public static final String READ_TIMEOUT = "readTimeOut";
public static final String REQUEST_METHOD = "requestMethod";
public static final String FROM_DATE_OVERRIDE = "fromDateOverride";
public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride";
public static final String CONTENT_TOTALITEMS = "TotalItems";
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
// IETF Draft and used by Repositories like ZENODO , not included in APACHE HTTP java packages
// see https://ietf-wg-httpapi.github.io/ratelimit-headers/draft-ietf-httpapi-ratelimit-headers.html
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT = "X-RateLimit-Limit";
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING = "X-RateLimit-Remaining";
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_RESET = "X-RateLimit-Reset";
}

View File

@ -7,14 +7,14 @@ import java.sql.*;
import java.util.function.Consumer;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class DbClient implements Closeable {
private static final Logger log = LoggerFactory.getLogger(DbClient.class);
private static final Log log = LogFactory.getLog(DbClient.class);
private final Connection connection;
private Connection connection;
public DbClient(final String address, final String login, final String password) {
@ -37,8 +37,6 @@ public class DbClient implements Closeable {
try (final Statement stmt = connection.createStatement()) {
stmt.setFetchSize(100);
log.info("running SQL:\n\n{}\n\n", sql);
try (final ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
consumer.accept(rs);

View File

@ -0,0 +1,412 @@
package eu.dnetlib.dhp.common;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class GraphResultMapper implements Serializable {
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
E in) {
CommunityResult out = new CommunityResult();
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
if (ort.isPresent()) {
switch (ort.get().getClassid()) {
case "publication":
Optional<Journal> journal = Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
if (journal.isPresent()) {
Journal j = journal.get();
Container c = new Container();
c.setConferencedate(j.getConferencedate());
c.setConferenceplace(j.getConferenceplace());
c.setEdition(j.getEdition());
c.setEp(j.getEp());
c.setIss(j.getIss());
c.setIssnLinking(j.getIssnLinking());
c.setIssnOnline(j.getIssnOnline());
c.setIssnPrinted(j.getIssnPrinted());
c.setName(j.getName());
c.setSp(j.getSp());
c.setVol(j.getVol());
out.setContainer(c);
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
}
break;
case "dataset":
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
out
.setGeolocation(
Optional
.ofNullable(id.getGeolocation())
.map(
igl -> igl
.stream()
.filter(Objects::nonNull)
.map(gli -> {
GeoLocation gl = new GeoLocation();
gl.setBox(gli.getBox());
gl.setPlace(gli.getPlace());
gl.setPoint(gli.getPoint());
return gl;
})
.collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
break;
case "software":
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
Optional
.ofNullable(is.getCodeRepositoryUrl())
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
Optional
.ofNullable(is.getDocumentationUrl())
.ifPresent(
value -> out
.setDocumentationUrl(
value
.stream()
.map(v -> v.getValue())
.collect(Collectors.toList())));
Optional
.ofNullable(is.getProgrammingLanguage())
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
break;
case "other":
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
out
.setContactgroup(
Optional
.ofNullable(ir.getContactgroup())
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
.orElse(null));
out
.setContactperson(
Optional
.ofNullable(ir.getContactperson())
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
.orElse(null));
out
.setTool(
Optional
.ofNullable(ir.getTool())
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
break;
}
Optional
.ofNullable(input.getAuthor())
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
// I do not map Access Right UNKNOWN or OTHER
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if (oar.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
out
.setBestaccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
Optional
.ofNullable(input.getCountry())
.ifPresent(
value -> out
.setCountry(
value
.stream()
.map(
c -> {
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
return null;
}
Country country = new Country();
country.setCode(c.getClassid());
country.setLabel(c.getClassname());
Optional
.ofNullable(c.getDataInfo())
.ifPresent(
provenance -> country
.setProvenance(
Provenance
.newInstance(
provenance
.getProvenanceaction()
.getClassname(),
c.getDataInfo().getTrust())));
return country;
})
.filter(Objects::nonNull)
.collect(Collectors.toList())));
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
out.setDateofcollection(input.getDateofcollection());
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
if (oStr.isPresent()) {
out.setEmbargoenddate(oStr.get().getValue());
}
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
out.setId(input.getId());
out.setOriginalId(input.getOriginalId());
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
.ofNullable(input.getInstance());
if (oInst.isPresent()) {
out
.setInstance(
oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
}
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
if (oL.isPresent()) {
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
}
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
if (oLong.isPresent()) {
out.setLastupdatetimestamp(oLong.get());
}
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
if (otitle.isPresent()) {
List<StructuredProperty> iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setMaintitle(iTitle.get(0).getValue());
}
iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setSubtitle(iTitle.get(0).getValue());
}
}
List<ControlledField> pids = new ArrayList<>();
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> value
.stream()
.forEach(
p -> pids
.add(
ControlledField
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
out.setPid(pids);
oStr = Optional.ofNullable(input.getDateofacceptance());
if (oStr.isPresent()) {
out.setPublicationdate(oStr.get().getValue());
}
oStr = Optional.ofNullable(input.getPublisher());
if (oStr.isPresent()) {
out.setPublisher(oStr.get().getValue());
}
List<String> sourceList = new ArrayList<>();
Optional
.ofNullable(input.getSource())
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
List<Subject> subjectList = new ArrayList<>();
Optional
.ofNullable(input.getSubject())
.ifPresent(
value -> value
.forEach(s -> subjectList.add(getSubject(s))));
out.setSubjects(subjectList);
out.setType(input.getResulttype().getClassid());
}
out
.setCollectedfrom(
input
.getCollectedfrom()
.stream()
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
.collect(Collectors.toList()));
return out;
}
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
CommunityInstance instance = new CommunityInstance();
setCommonValue(i, instance);
instance
.setCollectedfrom(
KeyValue
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
instance
.setHostedby(
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
return instance;
}
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
.ofNullable(i.getAccessright());
if (opAr.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
instance
.setAccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
Optional
.ofNullable(i.getLicense())
.ifPresent(value -> instance.setLicense(value.getValue()));
Optional
.ofNullable(i.getDateofacceptance())
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
Optional
.ofNullable(i.getRefereed())
.ifPresent(value -> instance.setRefereed(value.getClassname()));
Optional
.ofNullable(i.getInstancetype())
.ifPresent(value -> instance.setType(value.getClassname()));
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
}
private static Subject getSubject(StructuredProperty s) {
Subject subject = new Subject();
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
if (di.isPresent()) {
Provenance p = new Provenance();
p.setProvenance(di.get().getProvenanceaction().getClassname());
p.setTrust(di.get().getTrust());
subject.setProvenance(p);
}
return subject;
}
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
Author a = new Author();
a.setFullname(oa.getFullname());
a.setName(oa.getName());
a.setSurname(oa.getSurname());
a.setRank(oa.getRank());
Optional<List<StructuredProperty>> oPids = Optional
.ofNullable(oa.getPid());
if (oPids.isPresent()) {
Pid pid = getOrcid(oPids.get());
if (pid != null) {
a.setPid(pid);
}
}
return a;
}
private static Pid getOrcid(List<StructuredProperty> p) {
for (StructuredProperty pid : p) {
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
if (di.isPresent()) {
return Pid
.newInstance(
ControlledField
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue()),
Provenance
.newInstance(
di.get().getProvenanceaction().getClassname(),
di.get().getTrust()));
} else {
return Pid
.newInstance(
ControlledField
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue())
);
}
}
}
return null;
}
}

View File

@ -28,7 +28,7 @@ public class HdfsSupport {
* @param configuration Configuration of hadoop env
*/
public static boolean exists(String path, Configuration configuration) {
logger.info("Checking existence for path: {}", path);
logger.info("Removing path: {}", path);
return rethrowAsRuntimeException(
() -> {
Path f = new Path(path);

View File

@ -1,100 +0,0 @@
package eu.dnetlib.dhp.common;
/**
* This utility represent the Metadata Store information
* needed during the migration from mongo to HDFS to store
*/
public class MDStoreInfo {
private String mdstore;
private String currentId;
private Long latestTimestamp;
/**
* Instantiates a new Md store info.
*/
public MDStoreInfo() {
}
/**
* Instantiates a new Md store info.
*
* @param mdstore the mdstore
* @param currentId the current id
* @param latestTimestamp the latest timestamp
*/
public MDStoreInfo(String mdstore, String currentId, Long latestTimestamp) {
this.mdstore = mdstore;
this.currentId = currentId;
this.latestTimestamp = latestTimestamp;
}
/**
* Gets mdstore.
*
* @return the mdstore
*/
public String getMdstore() {
return mdstore;
}
/**
* Sets mdstore.
*
* @param mdstore the mdstore
* @return the mdstore
*/
public MDStoreInfo setMdstore(String mdstore) {
this.mdstore = mdstore;
return this;
}
/**
* Gets current id.
*
* @return the current id
*/
public String getCurrentId() {
return currentId;
}
/**
* Sets current id.
*
* @param currentId the current id
* @return the current id
*/
public MDStoreInfo setCurrentId(String currentId) {
this.currentId = currentId;
return this;
}
/**
* Gets latest timestamp.
*
* @return the latest timestamp
*/
public Long getLatestTimestamp() {
return latestTimestamp;
}
/**
* Sets latest timestamp.
*
* @param latestTimestamp the latest timestamp
* @return the latest timestamp
*/
public MDStoreInfo setLatestTimestamp(Long latestTimestamp) {
this.latestTimestamp = latestTimestamp;
return this;
}
@Override
public String toString() {
return "MDStoreInfo{" +
"mdstore='" + mdstore + '\'' +
", currentId='" + currentId + '\'' +
", latestTimestamp=" + latestTimestamp +
'}';
}
}

View File

@ -5,100 +5,47 @@ import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MakeTarArchive implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeTarArchive.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
MakeTarArchive.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/input_maketar_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", outputPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath);
final int gBperSplit = Optional
.ofNullable(parser.get("splitSize"))
.map(Integer::valueOf)
.orElse(10);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit);
}
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit)
throws IOException {
RemoteIterator<LocatedFileStatus> dirIterator = fileSystem.listLocatedStatus(new Path(inputPath));
while (dirIterator.hasNext()) {
LocatedFileStatus fileStatus = dirIterator.next();
Path p = fileStatus.getPath();
String pathString = p.toString();
String entity = pathString.substring(pathString.lastIndexOf("/") + 1);
MakeTarArchive.tarMaxSize(fileSystem, pathString, outputPath + "/" + entity, entity, gBperSplit);
}
}
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream());
fsDataOutputStream = fileSystem.create(hdfsWritePath);
return new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
}
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dirName)
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
try (TarArchiveOutputStream ar = new TarArchiveOutputStream(
fileSystem.create(hdfsWritePath).getWrappedStream())) {
fsDataOutputStream = fileSystem.create(hdfsWritePath);
RemoteIterator<LocatedFileStatus> iterator = fileSystem
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
new Path(inputPath), true);
while (iterator.hasNext()) {
writeCurrentFile(fileSystem, dirName, iterator, ar, 0);
while (fileStatusListIterator.hasNext()) {
writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, 0);
}
}
ar.close();
}
public static void tarMaxSize(FileSystem fileSystem, String inputPath, String outputPath, String dir_name,
@ -117,40 +64,35 @@ public class MakeTarArchive implements Serializable {
new Path(inputPath), true);
boolean next = fileStatusListIterator.hasNext();
while (next) {
try (TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar")) {
TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar");
long currentSize = 0;
while (next && currentSize < bytesPerSplit) {
currentSize = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, currentSize);
long current_size = 0;
while (next && current_size < bytesPerSplit) {
current_size = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, current_size);
next = fileStatusListIterator.hasNext();
}
partNum += 1;
}
}
}
ar.close();
}
private static long writeCurrentFile(FileSystem fileSystem, String dirName,
}
}
private static long writeCurrentFile(FileSystem fileSystem, String dir_name,
RemoteIterator<LocatedFileStatus> fileStatusListIterator,
TarArchiveOutputStream ar, long currentSize) throws IOException {
TarArchiveOutputStream ar, long current_size) throws IOException {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String pString = p.toString();
if (!pString.endsWith("_SUCCESS")) {
String name = pString.substring(pString.lastIndexOf("/") + 1);
if (name.startsWith("part-") & name.length() > 10) {
String tmp = name.substring(0, 10);
if (name.contains(".")) {
tmp += name.substring(name.indexOf("."));
}
name = tmp;
}
TarArchiveEntry entry = new TarArchiveEntry(dirName + "/" + name);
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
entry.setSize(fileStatus.getLen());
currentSize += fileStatus.getLen();
current_size += fileStatus.getLen();
ar.putArchiveEntry(entry);
InputStream is = fileSystem.open(fileStatus.getPath());
@ -158,7 +100,7 @@ public class MakeTarArchive implements Serializable {
BufferedInputStream bis = new BufferedInputStream(is);
int count;
byte[] data = new byte[1024];
byte data[] = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}
@ -166,7 +108,7 @@ public class MakeTarArchive implements Serializable {
ar.closeArchiveEntry();
}
return currentSize;
return current_size;
}
}

View File

@ -1,15 +1,17 @@
package eu.dnetlib.dhp.common;
import static com.mongodb.client.model.Sorts.descending;
import java.io.Closeable;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -19,7 +21,6 @@ import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.QueryBuilder;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
@ -38,26 +39,6 @@ public class MdstoreClient implements Closeable {
this.db = getDb(client, dbName);
}
private Long parseTimestamp(Document f) {
if (f == null || !f.containsKey("timestamp"))
return null;
Object ts = f.get("timestamp");
return Long.parseLong(ts.toString());
}
public Long getLatestTimestamp(final String collectionId) {
MongoCollection<Document> collection = db.getCollection(collectionId);
FindIterable<Document> result = collection.find().sort(descending("timestamp")).limit(1);
if (result == null) {
return null;
}
Document f = result.first();
return parseTimestamp(f);
}
public MongoCollection<Document> mdStore(final String mdId) {
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
@ -65,7 +46,7 @@ public class MdstoreClient implements Closeable {
final String currentId = Optional
.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query))
.map(FindIterable::first)
.map(r -> r.first())
.map(d -> d.getString("currentId"))
.orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId));
@ -74,16 +55,6 @@ public class MdstoreClient implements Closeable {
return getColl(db, currentId, true);
}
public List<MDStoreInfo> mdStoreWithTimestamp(final String mdFormat, final String mdLayout,
final String mdInterpretation) {
Map<String, String> res = validCollections(mdFormat, mdLayout, mdInterpretation);
return res
.entrySet()
.stream()
.map(e -> new MDStoreInfo(e.getKey(), e.getValue(), getLatestTimestamp(e.getValue())))
.collect(Collectors.toList());
}
public Map<String, String> validCollections(
final String mdFormat, final String mdLayout, final String mdInterpretation) {
@ -113,7 +84,7 @@ public class MdstoreClient implements Closeable {
if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
log.warn(err);
throw new IllegalArgumentException(err);
throw new RuntimeException(err);
}
return client.getDatabase(dbName);
}
@ -126,7 +97,7 @@ public class MdstoreClient implements Closeable {
String.format("Missing collection '%s' in database '%s'", collName, db.getName()));
log.warn(err);
if (abortIfMissing) {
throw new IllegalArgumentException(err);
throw new RuntimeException(err);
} else {
return null;
}

View File

@ -1,18 +1,18 @@
package eu.dnetlib.dhp.common;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;
import com.ctc.wstx.dtd.LargePrefixedNameSet;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
@ -24,24 +24,13 @@ import com.google.common.hash.Hashing;
*/
public class PacePerson {
private static final String UTF8 = "UTF-8";
private List<String> name = Lists.newArrayList();
private List<String> surname = Lists.newArrayList();
private List<String> fullname = Lists.newArrayList();
private final String original;
private static Set<String> particles;
static {
try {
particles = new HashSet<>(IOUtils
.readLines(
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private static Set<String> particles = null;
/**
* Capitalizes a string
@ -49,20 +38,29 @@ public class PacePerson {
* @param s the string to capitalize
* @return the input string with capital letter
*/
public static String capitalize(final String s) {
if (particles.contains(s)) {
return s;
}
public static final String capitalize(final String s) {
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
}
/**
* Adds a dot to a string with length equals to 1
*/
public static String dotAbbreviations(final String s) {
public static final String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s;
}
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
/**
* The constructor of the class. It fills the fields of the class basing on the input fullname.
*
@ -131,6 +129,10 @@ public class PacePerson {
}
private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) {
@ -186,36 +188,17 @@ public class PacePerson {
}
public List<String> getCapitalFirstnames() {
return Optional
.ofNullable(getNameWithAbbreviations())
.map(
name -> name
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
return Lists
.newArrayList(
Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
}
public List<String> getCapitalSurname() {
return Optional
.ofNullable(getSurname())
.map(
surname -> surname
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
}
public List<String> getNameWithAbbreviations() {
return Optional
.ofNullable(getName())
.map(
name -> name
.stream()
.map(PacePerson::dotAbbreviations)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
}
public boolean isAccurate() {

View File

@ -1,81 +0,0 @@
package eu.dnetlib.dhp.common.action;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class ReadDatasourceMasterDuplicateFromDB {
private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId "
+
"FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
throws IOException {
int count = 0;
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath));
log.info("running query: {}", QUERY);
log.info("storing results in: {}", hdfsPath);
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
dbClient.processResults(QUERY, rs -> writeMap(datasourceMasterMap(rs), writer));
count++;
}
}
return count;
}
private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
try {
final MasterDuplicate md = new MasterDuplicate();
final String duplicateId = rs.getString("duplicateId");
final String masterId = rs.getString("masterId");
final String masterName = rs.getString("masterName");
md.setDuplicateId(OafMapperUtils.createOpenaireId(10, duplicateId, true));
md.setMasterId(OafMapperUtils.createOpenaireId(10, masterId, true));
md.setMasterName(masterName);
return md;
} catch (final SQLException e) {
throw new RuntimeException(e);
}
}
private static void writeMap(final MasterDuplicate dm, final BufferedWriter writer) {
try {
writer.write(OBJECT_MAPPER.writeValueAsString(dm));
writer.newLine();
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.common.action.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 21/07/22
*/
public class MasterDuplicate implements Serializable {
private String duplicateId;
private String masterId;
private String masterName;
public String getDuplicateId() {
return duplicateId;
}
public void setDuplicateId(String duplicateId) {
this.duplicateId = duplicateId;
}
public String getMasterId() {
return masterId;
}
public void setMasterId(String masterId) {
this.masterId = masterId;
}
public String getMasterName() {
return masterName;
}
public void setMasterName(String masterName) {
this.masterName = masterName;
}
}

View File

@ -1,45 +0,0 @@
package eu.dnetlib.dhp.common.aggregation;
import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.utils.DHPUtils;
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class);
private transient MessageSender messageSender;
public AggregatorReport() {
}
public AggregatorReport(MessageSender messageSender) {
this.messageSender = messageSender;
}
public void ongoing(Long current, Long total) {
messageSender.sendMessage(current, total);
}
@Override
public void close() throws IOException {
if (Objects.nonNull(messageSender)) {
log.info("closing report: ");
this.forEach((k, v) -> log.info("{} - {}", k, v));
Map<String, String> m = new HashMap<>();
m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
messageSender.sendReport(m);
}
}
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.common.api;
import java.io.IOException;
import java.io.InputStream;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.internal.Util;
import okio.BufferedSink;
import okio.Okio;
import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private InputStream inputStream;
private MediaType mediaType;
private long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
return new InputStreamRequestBody(inputStream, mediaType, len);
}
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
this.inputStream = inputStream;
this.mediaType = mediaType;
this.lenght = len;
}
@Override
public MediaType contentType() {
return mediaType;
}
@Override
public long contentLength() {
return lenght;
}
@Override
public void writeTo(BufferedSink sink) throws IOException {
Source source = null;
try {
source = Okio.source(inputStream);
sink.writeAll(source);
} finally {
Util.closeQuietly(source);
}
}
}

View File

@ -0,0 +1,8 @@
package eu.dnetlib.dhp.common.api;
public class MissingConceptDoiException extends Throwable {
public MissingConceptDoiException(String message) {
super(message);
}
}

View File

@ -0,0 +1,315 @@
package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
import okhttp3.*;
public class ZenodoAPIClient implements Serializable {
String urlString;
String bucket;
String deposition_id;
String access_token;
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
public String getUrlString() {
return urlString;
}
public void setUrlString(String urlString) {
this.urlString = urlString;
}
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public void setDeposition_id(String deposition_id) {
this.deposition_id = deposition_id;
}
public ZenodoAPIClient(String urlString, String access_token) throws IOException {
this.urlString = urlString;
this.access_token = access_token;
}
/**
* Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
*
* @return response code
* @throws IOException
*/
public int newDeposition() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
// Get response body
json = response.body().string();
ZenodoModel newSubmission = new Gson().fromJson(json, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return response.code();
}
}
/**
* Upload files in Zenodo.
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @param len the size of the file
* @return the response code
*/
public int uploadIS(InputStream is, String file_name, long len) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder()
.writeTimeout(600, TimeUnit.SECONDS)
.readTimeout(600, TimeUnit.SECONDS)
.connectTimeout(600, TimeUnit.SECONDS)
.build();
Request request = new Request.Builder()
.url(bucket + "/" + file_name)
.addHeader("Content-Type", "application/zip") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len))
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
}
/**
* Associates metadata information to the current deposition
*
* @param metadata the metadata
* @return response code
* @throws IOException
*/
public int sendMretadata(String metadata) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(metadata, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.put(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
}
/**
* To publish the current deposition. It works for both new deposition or new version of an old deposition
*
* @return response code
* @throws IOException
*/
public int publish() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/publish")
.addHeader("Authorization", "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
}
/**
* To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
* for the new version.
*
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id);
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader("Authorization", "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return response.code();
}
}
/**
* To finish uploading a version or new deposition not published
* It sets the deposition_id and the bucket to be used
*
*
* @param deposition_id the deposition id of the not yet published upload
* concept_rec_id = 656930
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
this.deposition_id = deposition_id;
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Authorization", "Bearer " + access_token)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return response.code();
}
}
private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class);
for (ZenodoModel zm : zenodoModelList) {
if (zm.getConceptrecid().equals(concept_rec_id)) {
deposition_id = zm.getId();
return;
}
}
throw new MissingConceptDoiException("The concept record id specified was missing in the list of depositions");
}
private String getPrevDepositions() throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
Request request = new Request.Builder()
.url(urlString)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.get()
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.body().string();
}
}
private String getBucket(String url) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder()
.connectTimeout(600, TimeUnit.SECONDS)
.build();
Request request = new Request.Builder()
.url(url)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.get()
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
// Get response body
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
return zenodoModel.getLinks().getBucket();
}
}
}

View File

@ -1,39 +0,0 @@
package eu.dnetlib.dhp.common.api.context;
public class CategorySummary {
private String id;
private String label;
private boolean hasConcept;
public String getId() {
return id;
}
public String getLabel() {
return label;
}
public boolean isHasConcept() {
return hasConcept;
}
public CategorySummary setId(final String id) {
this.id = id;
return this;
}
public CategorySummary setLabel(final String label) {
this.label = label;
return this;
}
public CategorySummary setHasConcept(final boolean hasConcept) {
this.hasConcept = hasConcept;
return this;
}
}

View File

@ -1,7 +0,0 @@
package eu.dnetlib.dhp.common.api.context;
import java.util.ArrayList;
public class CategorySummaryList extends ArrayList<CategorySummary> {
}

View File

@ -1,52 +0,0 @@
package eu.dnetlib.dhp.common.api.context;
import java.util.List;
public class ConceptSummary {
private String id;
private String label;
public boolean hasSubConcept;
private List<ConceptSummary> concepts;
public String getId() {
return id;
}
public String getLabel() {
return label;
}
public List<ConceptSummary> getConcepts() {
return concepts;
}
public ConceptSummary setId(final String id) {
this.id = id;
return this;
}
public ConceptSummary setLabel(final String label) {
this.label = label;
return this;
}
public boolean isHasSubConcept() {
return hasSubConcept;
}
public ConceptSummary setHasSubConcept(final boolean hasSubConcept) {
this.hasSubConcept = hasSubConcept;
return this;
}
public ConceptSummary setConcept(final List<ConceptSummary> concepts) {
this.concepts = concepts;
return this;
}
}

View File

@ -1,7 +0,0 @@
package eu.dnetlib.dhp.common.api.context;
import java.util.ArrayList;
public class ConceptSummaryList extends ArrayList<ConceptSummary> {
}

View File

@ -1,50 +0,0 @@
package eu.dnetlib.dhp.common.api.context;
public class ContextSummary {
private String id;
private String label;
private String type;
private String status;
public String getId() {
return id;
}
public String getLabel() {
return label;
}
public String getType() {
return type;
}
public String getStatus() {
return status;
}
public ContextSummary setId(final String id) {
this.id = id;
return this;
}
public ContextSummary setLabel(final String label) {
this.label = label;
return this;
}
public ContextSummary setType(final String type) {
this.type = type;
return this;
}
public ContextSummary setStatus(final String status) {
this.status = status;
return this;
}
}

View File

@ -1,7 +0,0 @@
package eu.dnetlib.dhp.common.api.context;
import java.util.ArrayList;
public class ContextSummaryList extends ArrayList<ContextSummary> {
}

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Community {
private String identifier;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Creator {
private String affiliation;
private String name;
private String orcid;
public String getAffiliation() {
return affiliation;
}
public void setAffiliation(String affiliation) {
this.affiliation = affiliation;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public static Creator newInstance(String name, String affiliation, String orcid) {
Creator c = new Creator();
if (!(name == null)) {
c.name = name;
}
if (!(affiliation == null)) {
c.affiliation = affiliation;
}
if (!(orcid == null)) {
c.orcid = orcid;
}
return c;
}
}

View File

@ -0,0 +1,58 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import net.minidev.json.annotate.JsonIgnore;
public class File implements Serializable {
private String checksum;
private String filename;
private long filesize;
private String id;
@JsonIgnore
// private Links links;
public String getChecksum() {
return checksum;
}
public void setChecksum(String checksum) {
this.checksum = checksum;
}
public String getFilename() {
return filename;
}
public void setFilename(String filename) {
this.filename = filename;
}
public long getFilesize() {
return filesize;
}
public void setFilesize(long filesize) {
this.filesize = filesize;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
// @JsonIgnore
// public Links getLinks() {
// return links;
// }
//
// @JsonIgnore
// public void setLinks(Links links) {
// this.links = links;
// }
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Grant implements Serializable {
private String id;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public static Grant newInstance(String id) {
Grant g = new Grant();
g.id = id;
return g;
}
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Links implements Serializable {
private String bucket;
private String discard;
private String edit;
private String files;
private String html;
private String latest_draft;
private String latest_draft_html;
private String publish;
private String self;
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public String getDiscard() {
return discard;
}
public void setDiscard(String discard) {
this.discard = discard;
}
public String getEdit() {
return edit;
}
public void setEdit(String edit) {
this.edit = edit;
}
public String getFiles() {
return files;
}
public void setFiles(String files) {
this.files = files;
}
public String getHtml() {
return html;
}
public void setHtml(String html) {
this.html = html;
}
public String getLatest_draft() {
return latest_draft;
}
public void setLatest_draft(String latest_draft) {
this.latest_draft = latest_draft;
}
public String getLatest_draft_html() {
return latest_draft_html;
}
public void setLatest_draft_html(String latest_draft_html) {
this.latest_draft_html = latest_draft_html;
}
public String getPublish() {
return publish;
}
public void setPublish(String publish) {
this.publish = publish;
}
public String getSelf() {
return self;
}
public void setSelf(String self) {
this.self = self;
}
}

View File

@ -0,0 +1,153 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class Metadata implements Serializable {
private String access_right;
private List<Community> communities;
private List<Creator> creators;
private String description;
private String doi;
private List<Grant> grants;
private List<String> keywords;
private String language;
private String license;
private PrereserveDoi prereserve_doi;
private String publication_date;
private List<String> references;
private List<RelatedIdentifier> related_identifiers;
private String title;
private String upload_type;
private String version;
public String getUpload_type() {
return upload_type;
}
public void setUpload_type(String upload_type) {
this.upload_type = upload_type;
}
public String getVersion() {
return version;
}
public void setVersion(String version) {
this.version = version;
}
public String getAccess_right() {
return access_right;
}
public void setAccess_right(String access_right) {
this.access_right = access_right;
}
public List<Community> getCommunities() {
return communities;
}
public void setCommunities(List<Community> communities) {
this.communities = communities;
}
public List<Creator> getCreators() {
return creators;
}
public void setCreators(List<Creator> creators) {
this.creators = creators;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public List<Grant> getGrants() {
return grants;
}
public void setGrants(List<Grant> grants) {
this.grants = grants;
}
public List<String> getKeywords() {
return keywords;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public String getLanguage() {
return language;
}
public void setLanguage(String language) {
this.language = language;
}
public String getLicense() {
return license;
}
public void setLicense(String license) {
this.license = license;
}
public PrereserveDoi getPrereserve_doi() {
return prereserve_doi;
}
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
this.prereserve_doi = prereserve_doi;
}
public String getPublication_date() {
return publication_date;
}
public void setPublication_date(String publication_date) {
this.publication_date = publication_date;
}
public List<String> getReferences() {
return references;
}
public void setReferences(List<String> references) {
this.references = references;
}
public List<RelatedIdentifier> getRelated_identifiers() {
return related_identifiers;
}
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
this.related_identifiers = related_identifiers;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class PrereserveDoi implements Serializable {
private String doi;
private String recid;
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getRecid() {
return recid;
}
public void setRecid(String recid) {
this.recid = recid;
}
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class RelatedIdentifier implements Serializable {
private String identifier;
private String relation;
private String resource_type;
private String scheme;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public String getRelation() {
return relation;
}
public void setRelation(String relation) {
this.relation = relation;
}
public String getResource_type() {
return resource_type;
}
public void setResource_type(String resource_type) {
this.resource_type = resource_type;
}
public String getScheme() {
return scheme;
}
public void setScheme(String scheme) {
this.scheme = scheme;
}
}

View File

@ -0,0 +1,118 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class ZenodoModel implements Serializable {
private String conceptrecid;
private String created;
private List<File> files;
private String id;
private Links links;
private Metadata metadata;
private String modified;
private String owner;
private String record_id;
private String state;
private boolean submitted;
private String title;
public String getConceptrecid() {
return conceptrecid;
}
public void setConceptrecid(String conceptrecid) {
this.conceptrecid = conceptrecid;
}
public String getCreated() {
return created;
}
public void setCreated(String created) {
this.created = created;
}
public List<File> getFiles() {
return files;
}
public void setFiles(List<File> files) {
this.files = files;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Links getLinks() {
return links;
}
public void setLinks(Links links) {
this.links = links;
}
public Metadata getMetadata() {
return metadata;
}
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
}
public String getModified() {
return modified;
}
public void setModified(String modified) {
this.modified = modified;
}
public String getOwner() {
return owner;
}
public void setOwner(String owner) {
this.owner = owner;
}
public String getRecord_id() {
return record_id;
}
public void setRecord_id(String record_id) {
this.record_id = record_id;
}
public String getState() {
return state;
}
public void setState(String state) {
this.state = state;
}
public boolean isSubmitted() {
return submitted;
}
public void setSubmitted(boolean submitted) {
this.submitted = submitted;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@ -0,0 +1,7 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.util.ArrayList;
public class ZenodoModelList extends ArrayList<ZenodoModel> {
}

View File

@ -1,32 +0,0 @@
package eu.dnetlib.dhp.common.collection;
public class CollectorException extends Exception {
/** */
private static final long serialVersionUID = -290723075076039757L;
public CollectorException() {
super();
}
public CollectorException(
final String message,
final Throwable cause,
final boolean enableSuppression,
final boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
public CollectorException(final String message, final Throwable cause) {
super(message, cause);
}
public CollectorException(final String message) {
super(message);
}
public CollectorException(final Throwable cause) {
super(cause);
}
}

View File

@ -1,40 +0,0 @@
package eu.dnetlib.dhp.common.collection;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class DecompressTarGz {
public static void doExtract(FileSystem fs, String outputPath, String tarGzPath) throws IOException {
FSDataInputStream inputFileStream = fs.open(new Path(tarGzPath));
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(inputFileStream))) {
TarArchiveEntry entry = null;
while ((entry = tais.getNextTarEntry()) != null) {
if (!entry.isDirectory()) {
try (
FSDataOutputStream out = fs
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(tais, gzipOs);
}
}
}
}
}
}

View File

@ -1,56 +0,0 @@
package eu.dnetlib.dhp.common.collection;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.bean.CsvToBeanBuilder;
public class GetCSV {
public static final char DEFAULT_DELIMITER = ',';
private GetCSV() {
}
public static void getCsv(FileSystem fileSystem, BufferedReader reader, String hdfsPath,
String modelClass) throws IOException, ClassNotFoundException {
getCsv(fileSystem, reader, hdfsPath, modelClass, DEFAULT_DELIMITER);
}
public static void getCsv(FileSystem fileSystem, Reader reader, String hdfsPath,
String modelClass, char delimiter) throws IOException, ClassNotFoundException {
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, false);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
try (BufferedWriter writer = new BufferedWriter(
new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8))) {
final ObjectMapper mapper = new ObjectMapper();
@SuppressWarnings("unchecked")
final List lines = new CsvToBeanBuilder(reader)
.withType(Class.forName(modelClass))
.withSeparator(delimiter)
.build()
.parse();
for (Object line : lines) {
writer.write(mapper.writeValueAsString(line));
writer.newLine();
}
}
}
}

View File

@ -1,127 +0,0 @@
package eu.dnetlib.dhp.common.collection;
import java.util.HashMap;
import java.util.Map;
/**
* Bundles the http connection parameters driving the client behaviour.
*/
public class HttpClientParams {
// Defaults
public static int _maxNumberOfRetry = 3;
public static int _requestDelay = 0; // milliseconds
public static int _retryDelay = 10; // seconds
public static int _connectTimeOut = 10; // seconds
public static int _readTimeOut = 30; // seconds
public static String _requestMethod = "GET";
/**
* Maximum number of allowed retires before failing
*/
private int maxNumberOfRetry;
/**
* Delay between request (Milliseconds)
*/
private int requestDelay;
/**
* Time to wait after a failure before retrying (Seconds)
*/
private int retryDelay;
/**
* Connect timeout (Seconds)
*/
private int connectTimeOut;
/**
* Read timeout (Seconds)
*/
private int readTimeOut;
/**
* Custom http headers
*/
private Map<String, String> headers;
/**
* Request method (i.e., GET, POST etc)
*/
private String requestMethod;
public HttpClientParams() {
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut, new HashMap<>(),
_requestMethod);
}
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
int readTimeOut, Map<String, String> headers, String requestMethod) {
this.maxNumberOfRetry = maxNumberOfRetry;
this.requestDelay = requestDelay;
this.retryDelay = retryDelay;
this.connectTimeOut = connectTimeOut;
this.readTimeOut = readTimeOut;
this.headers = headers;
this.requestMethod = requestMethod;
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getRequestDelay() {
return requestDelay;
}
public void setRequestDelay(int requestDelay) {
this.requestDelay = requestDelay;
}
public int getRetryDelay() {
return retryDelay;
}
public void setRetryDelay(int retryDelay) {
this.retryDelay = retryDelay;
}
public void setConnectTimeOut(int connectTimeOut) {
this.connectTimeOut = connectTimeOut;
}
public int getConnectTimeOut() {
return connectTimeOut;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(int readTimeOut) {
this.readTimeOut = readTimeOut;
}
public Map<String, String> getHeaders() {
return headers;
}
public void setHeaders(Map<String, String> headers) {
this.headers = headers;
}
public String getRequestMethod() {
return requestMethod;
}
public void setRequestMethod(String requestMethod) {
this.requestMethod = requestMethod;
}
}

View File

@ -1,309 +0,0 @@
package eu.dnetlib.dhp.common.collection;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.http.HttpHeaders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
/**
* Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java
*
* @author jochen, michele, andrea, alessia, claudio, andreas
*/
public class HttpConnector2 {
private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
private static final String REPORT_PREFIX = "http:";
private HttpClientParams clientParams;
private String responseType = null;
private static final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector2() {
this(new HttpClientParams());
}
public HttpConnector2(HttpClientParams clientParams) {
this.clientParams = clientParams;
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
return IOUtils.toInputStream(getInputSource(requestUrl));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public String getInputSource(final String requestUrl) throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, new AggregatorReport());
}
/**
* Given the URL returns the content via HTTP GET
*
* @param requestUrl the URL
* @param report the list of errors
* @return the content of the downloaded resource
* @throws CollectorException when retrying more than maxNumberOfRetry times
*/
public String getInputSource(final String requestUrl, AggregatorReport report)
throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, report);
}
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException {
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
return IOUtils.toString(s);
} catch (IOException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException, IOException {
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
final String msg = String
.format(
"Max number of retries (%s/%s) exceeded, failing.",
retryNumber, getClientParams().getMaxNumberOfRetry());
log.error(msg);
throw new CollectorException(msg);
}
InputStream input = null;
long start = System.currentTimeMillis();
try {
if (getClientParams().getRequestDelay() > 0) {
backoffAndSleep(getClientParams().getRequestDelay());
}
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
urlConn.setRequestMethod(getClientParams().getRequestMethod());
// if provided, add custom headers
if (!getClientParams().getHeaders().isEmpty()) {
for (Map.Entry<String, String> headerEntry : getClientParams().getHeaders().entrySet()) {
urlConn.addRequestProperty(headerEntry.getKey(), headerEntry.getValue());
}
}
logHeaderFields(urlConn);
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
String rateLimit = urlConn.getHeaderField(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT);
String rateRemaining = urlConn.getHeaderField(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING);
if ((rateLimit != null) && (rateRemaining != null) && (Integer.parseInt(rateRemaining) < 2)) {
if (retryAfter > 0) {
backoffAndSleep(retryAfter);
} else {
backoffAndSleep(1000);
}
}
if (is2xx(urlConn.getResponseCode())) {
return getInputStream(urlConn, start);
}
if (is3xx(urlConn.getResponseCode())) {
// REDIRECTS
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.info("The requested url has been moved to {}", newUrl);
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String.format("Moved to: %s", newUrl));
logRequestTime(start);
urlConn.disconnect();
if (retryAfter > 0) {
backoffAndSleep(retryAfter);
}
return attemptDownload(newUrl, retryNumber + 1, report);
}
if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) {
switch (urlConn.getResponseCode()) {
case HttpURLConnection.HTTP_NOT_FOUND:
case HttpURLConnection.HTTP_BAD_GATEWAY:
case HttpURLConnection.HTTP_UNAVAILABLE:
case HttpURLConnection.HTTP_GATEWAY_TIMEOUT:
if (retryAfter > 0) {
log
.warn(
"waiting and repeating request after suggested retry-after {} sec for URL {}",
retryAfter, requestUrl);
backoffAndSleep(retryAfter * 1000);
} else {
log
.warn(
"waiting and repeating request after default delay of {} sec for URL {}",
getClientParams().getRetryDelay(), requestUrl);
backoffAndSleep(retryNumber * getClientParams().getRetryDelay());
}
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
logRequestTime(start);
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, report);
case 422: // UNPROCESSABLE ENTITY
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
log.warn("waiting and repeating request after 10 sec for URL {}", requestUrl);
backoffAndSleep(10000);
urlConn.disconnect();
logRequestTime(start);
try {
return getInputStream(urlConn, start);
} catch (IOException e) {
log
.error(
"server returned 422 and got IOException accessing the response body from URL {}",
requestUrl);
log.error("IOException:", e);
return attemptDownload(requestUrl, retryNumber + 1, report);
}
default:
log.error("gor error {} from URL: {}", urlConn.getResponseCode(), urlConn.getURL());
log.error("response message: {}", urlConn.getResponseMessage());
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String
.format(
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
logRequestTime(start);
urlConn.disconnect();
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
}
}
throw new CollectorException(
String
.format(
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
MAPPER.writeValueAsString(report)));
} catch (MalformedURLException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e.getMessage(), e);
} catch (SocketTimeoutException | SocketException | UnknownHostException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
return attemptDownload(requestUrl, retryNumber + 1, report);
}
}
private InputStream getInputStream(HttpURLConnection urlConn, long start) throws IOException {
InputStream input = urlConn.getInputStream();
responseType = urlConn.getContentType();
logRequestTime(start);
return input;
}
private static void logRequestTime(long start) {
log
.info(
"request time elapsed: {}sec",
TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - start));
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.info("Response: {} - {}", urlConn.getResponseCode(), urlConn.getResponseMessage());
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (String v : e.getValue()) {
log.info(" key: {} - value: {}", e.getKey(), v);
}
}
}
}
private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
log.info("I'm going to sleep for {}ms", sleepTimeMs);
try {
Thread.sleep(sleepTimeMs);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (!headerMap.get(key).isEmpty())
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
return Integer.parseInt(headerMap.get(key).get(0));
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
return headerMap.get(key).get(0);
}
}
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
}
private boolean is2xx(final int statusCode) {
return statusCode >= 200 && statusCode <= 299;
}
private boolean is4xx(final int statusCode) {
return statusCode >= 400 && statusCode <= 499;
}
private boolean is3xx(final int statusCode) {
return statusCode >= 300 && statusCode <= 399;
}
private boolean is5xx(final int statusCode) {
return statusCode >= 500 && statusCode <= 599;
}
public String getResponseType() {
return responseType;
}
public HttpClientParams getClientParams() {
return clientParams;
}
public void setClientParams(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
}

View File

@ -1,11 +1,11 @@
package eu.dnetlib.dhp.common.rest;
import java.io.IOException;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
@ -21,22 +21,19 @@ public class DNetRestClient {
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
private static final ObjectMapper mapper = new ObjectMapper();
private DNetRestClient() {
}
private static ObjectMapper mapper = new ObjectMapper();
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet, clazz);
}
public static String doGET(final String url) throws IOException {
public static String doGET(final String url) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet);
}
public static <V> String doPOST(final String url, V objParam) throws IOException {
public static <V> String doPOST(final String url, V objParam) throws Exception {
final HttpPost httpPost = new HttpPost(url);
if (objParam != null) {
@ -48,12 +45,12 @@ public class DNetRestClient {
return doHTTPRequest(httpPost);
}
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws IOException {
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
return mapper.readValue(doPOST(url, objParam), clazz);
}
private static String doHTTPRequest(final HttpUriRequest r) throws IOException {
try (CloseableHttpClient client = HttpClients.createDefault()) {
private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
CloseableHttpClient client = HttpClients.createDefault();
log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
log
@ -65,8 +62,8 @@ public class DNetRestClient {
.map(h -> h.getName() + ":" + h.getValue())
.collect(Collectors.joining(",")));
return IOUtils.toString(client.execute(r).getEntity().getContent());
}
CloseableHttpResponse response = client.execute(r);
return IOUtils.toString(response.getEntity().getContent());
}
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {

View File

@ -4,15 +4,14 @@ package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Vocabulary implements Serializable {
@ -47,7 +46,7 @@ public class Vocabulary implements Serializable {
}
public VocabularyTerm getTerm(final String id) {
return Optional.ofNullable(id).map(String::toLowerCase).map(terms::get).orElse(null);
return Optional.ofNullable(id).map(s -> s.toLowerCase()).map(s -> terms.get(s)).orElse(null);
}
protected void addTerm(final String id, final String name) {
@ -63,46 +62,26 @@ public class Vocabulary implements Serializable {
}
public VocabularyTerm getTermBySynonym(final String syn) {
return Optional
.ofNullable(syn)
.map(s -> getTerm(synonyms.get(s.toLowerCase())))
.orElse(null);
return getTerm(synonyms.get(syn.toLowerCase()));
}
public Qualifier getTermAsQualifier(final String termId) {
return getTermAsQualifier(termId, false);
}
public Qualifier getTermAsQualifier(final String termId, boolean strict) {
final VocabularyTerm term = getTerm(termId);
if (Objects.nonNull(term)) {
return OafMapperUtils.qualifier(term.getId(), term.getName(), getId(), getName());
} else if (Objects.isNull(term) && strict) {
if (StringUtils.isBlank(termId)) {
return OafMapperUtils.unknown(getId(), getName());
} else if (termExists(termId)) {
final VocabularyTerm t = getTerm(termId);
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
} else {
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
}
}
public Qualifier getSynonymAsQualifier(final String syn) {
return getSynonymAsQualifier(syn, false);
}
public Qualifier getSynonymAsQualifier(final String syn, boolean strict) {
return Optional
.ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId(), strict))
.map(term -> getTermAsQualifier(term.getId()))
.orElse(null);
}
public Qualifier lookup(String id) {
return lookup(id, false);
}
public Qualifier lookup(String id, boolean strict) {
return Optional
.ofNullable(getSynonymAsQualifier(id, strict))
.orElse(getTermAsQualifier(id, strict));
// .orElse(OafMapperUtils.unknown(getId(), getName()));
}
}

View File

@ -7,8 +7,8 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -46,6 +46,7 @@ public class VocabularyGroup implements Serializable {
}
vocs.addTerm(vocId, termId, termName);
// vocs.addSynonyms(vocId, termId, termId);
}
}
@ -57,17 +58,10 @@ public class VocabularyGroup implements Serializable {
final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn);
// vocs.addSynonyms(vocId, termId, termId);
}
}
// add the term names as synonyms
vocs.vocs.values().forEach(voc -> {
voc.getTerms().values().forEach(term -> {
voc.addSynonym(term.getName().toLowerCase(), term.getId());
});
});
return vocs;
}
@ -81,13 +75,6 @@ public class VocabularyGroup implements Serializable {
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
}
public Optional<Vocabulary> find(final String vocId) {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(vocs::get);
}
public void addTerm(final String vocId, final String id, final String name) {
if (vocabularyExists(vocId)) {
vocs.get(vocId.toLowerCase()).addTerm(id, name);
@ -111,7 +98,7 @@ public class VocabularyGroup implements Serializable {
.getTerms()
.values()
.stream()
.map(VocabularyTerm::getId)
.map(t -> t.getId())
.collect(Collectors.toCollection(HashSet::new));
}
@ -135,24 +122,6 @@ public class VocabularyGroup implements Serializable {
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
}
public Qualifier lookupTermBySynonym(final String vocId, final String syn) {
return find(vocId)
.map(
vocabulary -> Optional
.ofNullable(vocabulary.getTerm(syn))
.map(
term -> OafMapperUtils
.qualifier(term.getId(), term.getName(), vocabulary.getId(), vocabulary.getName()))
.orElse(
Optional
.ofNullable(vocabulary.getTermBySynonym(syn))
.map(
term -> OafMapperUtils
.qualifier(term.getId(), term.getName(), vocabulary.getId(), vocabulary.getName()))
.orElse(null)))
.orElse(null);
}
/**
* getSynonymAsQualifierCaseSensitive
*
@ -185,19 +154,16 @@ public class VocabularyGroup implements Serializable {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(vocs::containsKey)
.map(id -> vocs.containsKey(id))
.orElse(false);
}
private void addSynonyms(final String vocId, final String termId, final String syn) {
String id = Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(s -> s.toLowerCase())
.orElseThrow(
() -> new IllegalArgumentException(
String
.format(
"empty vocabulary id for [term:%s, synonym:%s]", termId, syn)));
() -> new IllegalArgumentException(String.format("empty vocabulary id for [term:%s, synonym:%s]")));
Optional
.ofNullable(vocs.get(id))
.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
@ -9,8 +10,8 @@ public class Message implements Serializable {
private static final long serialVersionUID = 401753881204524893L;
public static final String CURRENT_PARAM = "current";
public static final String TOTAL_PARAM = "total";
public static String CURRENT_PARAM = "current";
public static String TOTAL_PARAM = "total";
private MessageType messageType;

View File

@ -34,7 +34,7 @@ public class MessageSender {
private final String workflowId;
private final ExecutorService executorService = Executors.newCachedThreadPool();
private ExecutorService executorService = Executors.newCachedThreadPool();
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
this.workflowId = workflowId;

View File

@ -4,13 +4,13 @@ package eu.dnetlib.dhp.oa.merge;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.model.Person;
import scala.Tuple2;
@ -19,9 +19,6 @@ public class AuthorMerger {
private static final Double THRESHOLD = 0.95;
private AuthorMerger() {
}
public static List<Author> merge(List<List<Author>> authors) {
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
@ -39,8 +36,7 @@ public class AuthorMerger {
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b);
List<Author> base;
List<Author> enrich;
List<Author> base, enrich;
int sa = authorsSize(a);
int sb = authorsSize(b);
@ -66,24 +62,22 @@ public class AuthorMerger {
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
final Map<String, Author> basePidAuthorMap = base
.stream()
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.flatMap(
a -> a
.getPid()
.stream()
.filter(Objects::nonNull)
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
// <pid, Author> (list of pid that are missing in the other list)
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream()
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.flatMap(
a -> a
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
.map(p -> new Tuple2<>(p, a)))
.collect(Collectors.toList());
@ -120,47 +114,11 @@ public class AuthorMerger {
});
}
public static String normalizeFullName(final String fullname) {
return nfd(fullname)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError
// in case
// of large input strings
.replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
private static String authorFieldToBeCompared(Author author) {
if (StringUtils.isNotBlank(author.getSurname())) {
return author.getSurname();
}
if (StringUtils.isNotBlank(author.getFullname())) {
return author.getFullname();
}
return null;
}
public static String pidToComparableString(StructuredProperty pid) {
final String classId = Optional
.ofNullable(pid)
.map(
p -> Optional
.ofNullable(p.getQualifier())
.map(Qualifier::getClassid)
.map(String::toLowerCase)
.orElse(""))
.orElse("");
return Optional
.ofNullable(pid)
.map(StructuredProperty::getValue)
.map(v -> String.join("|", v, classId))
.orElse("");
return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
: "")
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
}
public static int countAuthorsPids(List<Author> authors) {
@ -192,7 +150,7 @@ public class AuthorMerger {
}
private static boolean hasPid(Author a) {
if (a == null || a.getPid() == null || a.getPid().isEmpty())
if (a == null || a.getPid() == null || a.getPid().size() == 0)
return false;
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
}
@ -201,14 +159,11 @@ public class AuthorMerger {
if (StringUtils.isNotBlank(author.getSurname())) {
return new Person(author.getSurname() + ", " + author.getName(), false);
} else {
if (StringUtils.isNotBlank(author.getFullname()))
return new Person(author.getFullname(), false);
else
return new Person("", false);
}
}
public static String normalize(final String s) {
private static String normalize(final String s) {
String[] normalized = nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError

View File

@ -1,194 +0,0 @@
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.when;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
/**
* Groups the graph content by entity identifier to ensure ID uniqueness
*/
public class GroupEntitiesSparkJob {
private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class);
private static final Encoder<OafEntity> OAFENTITY_KRYO_ENC = Encoders.kryo(OafEntity.class);
private ArgumentApplicationParser parser;
public GroupEntitiesSparkJob(ArgumentApplicationParser parser) {
this.parser = parser;
}
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
GroupEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
new GroupEntitiesSparkJob(parser).run(isSparkSessionManaged, isLookupService);
}
public void run(Boolean isSparkSessionManaged, ISLookUpService isLookUpService)
throws ISLookUpException {
String graphInputPath = parser.get("graphInputPath");
log.info("graphInputPath: {}", graphInputPath);
String checkpointPath = parser.get("checkpointPath");
log.info("checkpointPath: {}", checkpointPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
boolean filterInvisible = Boolean.parseBoolean(parser.get("filterInvisible"));
log.info("filterInvisible: {}", filterInvisible);
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(checkpointPath, spark.sparkContext().hadoopConfiguration());
groupEntities(spark, graphInputPath, checkpointPath, outputPath, filterInvisible, vocs);
});
}
private static void groupEntities(
SparkSession spark,
String inputPath,
String checkpointPath,
String outputPath,
boolean filterInvisible, VocabularyGroup vocs) {
Dataset<OafEntity> allEntities = spark.emptyDataset(OAFENTITY_KRYO_ENC);
for (Map.Entry<EntityType, Class> e : ModelSupport.entityTypes.entrySet()) {
String entity = e.getKey().name();
Class<? extends OafEntity> entityClass = e.getValue();
String entityInputPath = inputPath + "/" + entity;
if (!HdfsSupport.exists(entityInputPath, spark.sparkContext().hadoopConfiguration())) {
continue;
}
allEntities = allEntities
.union(
((Dataset<OafEntity>) spark
.read()
.schema(Encoders.bean(entityClass).schema())
.json(entityInputPath)
.filter("length(id) > 0")
.as(Encoders.bean(entityClass)))
.map((MapFunction<OafEntity, OafEntity>) r -> r, OAFENTITY_KRYO_ENC));
}
Dataset<?> groupedEntities = allEntities
.map(
(MapFunction<OafEntity, OafEntity>) entity -> GraphCleaningFunctions
.applyCoarVocabularies(entity, vocs),
OAFENTITY_KRYO_ENC)
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
.map(
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
t.getClass().getName(), t),
Encoders.tuple(Encoders.STRING(), OAFENTITY_KRYO_ENC));
// pivot on "_1" (classname of the entity)
// created columns containing only entities of the same class
for (Map.Entry<EntityType, Class> e : ModelSupport.entityTypes.entrySet()) {
String entity = e.getKey().name();
Class<? extends OafEntity> entityClass = e.getValue();
groupedEntities = groupedEntities
.withColumn(
entity,
when(col("_1").equalTo(entityClass.getName()), col("_2")));
}
groupedEntities
.drop("_1", "_2")
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.save(checkpointPath);
ForkJoinPool parPool = new ForkJoinPool(ModelSupport.entityTypes.size());
ModelSupport.entityTypes
.entrySet()
.stream()
.map(e -> parPool.submit(() -> {
String entity = e.getKey().name();
Class<? extends OafEntity> entityClass = e.getValue();
spark
.read()
.load(checkpointPath)
.select(col(entity).as("value"))
.filter("value IS NOT NULL")
.as(OAFENTITY_KRYO_ENC)
.map((MapFunction<OafEntity, OafEntity>) r -> r, (Encoder<OafEntity>) Encoders.bean(entityClass))
.filter(filterInvisible ? "dataInfo.invisible != TRUE" : "TRUE")
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/" + entity);
}))
.collect(Collectors.toList())
.forEach(t -> {
try {
t.get();
} catch (InterruptedException | ExecutionException e) {
throw new RuntimeException(e);
}
});
}
}

View File

@ -1,77 +0,0 @@
package eu.dnetlib.dhp.oozie;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.commons.text.StringSubstitutor;
import org.apache.spark.SparkConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.io.Resources;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class RunSQLSparkJob {
private static final Logger log = LoggerFactory.getLogger(RunSQLSparkJob.class);
private final ArgumentApplicationParser parser;
public RunSQLSparkJob(ArgumentApplicationParser parser) {
this.parser = parser;
}
public static void main(String[] args) throws Exception {
Map<String, String> params = new HashMap<>();
for (int i = 0; i < args.length - 1; i++) {
if (args[i].startsWith("--")) {
params.put(args[i].substring(2), args[++i]);
}
}
/*
* String jsonConfiguration = IOUtils .toString( Objects .requireNonNull( RunSQLSparkJob.class
* .getResourceAsStream( "/eu/dnetlib/dhp/oozie/run_sql_parameters.json"))); final ArgumentApplicationParser
* parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args);
*/
Boolean isSparkSessionManaged = Optional
.ofNullable(params.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
URL url = com.google.common.io.Resources.getResource(params.get("sql"));
String raw_sql = Resources.toString(url, StandardCharsets.UTF_8);
String sql = StringSubstitutor.replace(raw_sql, params);
log.info("sql: {}", sql);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", params.get("hiveMetastoreUris"));
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
log.info("executing: {}", statement);
long startTime = System.currentTimeMillis();
spark.sql(statement).show();
log
.info(
"executed in {}",
DurationFormatUtils.formatDuration(System.currentTimeMillis() - startTime, "HH:mm:ss.S"));
}
});
}
}

View File

@ -12,9 +12,6 @@ import com.ximpleware.VTDNav;
/** Created by sandro on 9/29/16. */
public class VtdUtilityParser {
private VtdUtilityParser() {
}
public static List<Node> getTextValuesWithAttributes(
final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes)
throws VtdException {

View File

@ -0,0 +1,334 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.utils.PidBlacklistProvider;
public class CleaningFunctions {
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10.)";
public static final String DOI_PREFIX = "10.";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final Set<String> PID_BLACKLIST = new HashSet<>();
static {
PID_BLACKLIST.add("none");
PID_BLACKLIST.add("na");
}
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.nonNull(o.getCountry())) {
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
}
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
if (Objects.nonNull(r.getSubject())) {
r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
}
}
if (Objects.nonNull(r.getAuthor())) {
r.getAuthor().forEach(a -> {
if (Objects.nonNull(a.getPid())) {
a.getPid().forEach(p -> {
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
});
}
});
}
if (value instanceof Publication) {
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
public static <T extends Oaf> T cleanup(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
}
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
r.setPublisher(null);
}
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
r
.setLanguage(
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
r
.getSubject()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getTitle())) {
r
.setTitle(
r
.getTitle()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
r
.setDescription(
r
.getDescription()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPid())) {
r.setPid(processPidCleaning(r.getPid()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
.setResourcetype(
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
Optional
.ofNullable(i.getPid())
.ifPresent(pid -> {
final Set<StructuredProperty> pids =
pid
.stream()
.filter(Objects::nonNull)
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.collect(Collectors.toCollection(HashSet::new));
Optional.ofNullable(i.getAlternateIdentifier())
.ifPresent(altId -> {
final Set<StructuredProperty> altIds = altId.stream()
.filter(Objects::nonNull)
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.collect(Collectors.toCollection(HashSet::new));
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
});
});
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i
.setAccessright(
accessRight(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
}
if (Objects.isNull(i.getRefereed())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
}
}
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) {
r
.setBestaccessright(
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
} else {
r.setBestaccessright(bestaccessrights);
}
}
if (Objects.nonNull(r.getAuthor())) {
boolean nullRank = r
.getAuthor()
.stream()
.anyMatch(a -> Objects.isNull(a.getRank()));
if (nullRank) {
int i = 1;
for (Author author : r.getAuthor()) {
author.setRank(i++);
}
}
for (Author a : r.getAuthor()) {
if (Objects.isNull(a.getPid())) {
a.setPid(Lists.newArrayList());
} else {
a
.setPid(
a
.getPid()
.stream()
.filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
return p;
})
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.collect(
Collectors
.toMap(
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
LinkedHashMap::new))
.values()
.stream()
.collect(Collectors.toList()));
}
}
}
if (value instanceof Publication) {
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
return pids
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(Collectors.toList());
}
protected static StructuredProperty cleanValue(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
// HELPERS
private static void fixVocabName(Qualifier q, String vocabularyName) {
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
q.setSchemeid(vocabularyName);
q.setSchemename(vocabularyName);
}
}
private static AccessRight accessRight(String classid, String classname, String scheme) {
return OafMapperUtils
.accessRight(
classid, classname, scheme, scheme);
}
private static Qualifier qualifier(String classid, String classname, String scheme) {
return OafMapperUtils
.qualifier(
classid, classname, scheme, scheme);
}
/**
* Utility method that filter PID values on a per-type basis.
* @param s the PID whose value will be checked.
* @return false if the pid matches the filter criteria, true otherwise.
*/
public static boolean pidFilter(StructuredProperty s) {
final String pidValue = s.getValue();
if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(pidValue) ||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
return false;
}
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
return false;
}
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
return false;
}
return true;
}
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
String value = Optional
.ofNullable(pid.getValue())
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pid.getQualifier().getClassid()) {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX));
break;
}
return pid;
}
}

View File

@ -1,70 +0,0 @@
/*
* Copyright (c) 2024.
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
public class HashableStructuredProperty extends StructuredProperty {
private static final long serialVersionUID = 8371670185221126045L;
public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
if (value == null) {
return null;
}
final HashableStructuredProperty sp = new HashableStructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static HashableStructuredProperty newInstance(StructuredProperty sp) {
HashableStructuredProperty hsp = new HashableStructuredProperty();
hsp.setQualifier(sp.getQualifier());
hsp.setValue(sp.getValue());
hsp.setQualifier(sp.getQualifier());
return hsp;
}
public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
StructuredProperty sp = new StructuredProperty();
sp.setQualifier(hsp.getQualifier());
sp.setValue(hsp.getValue());
sp.setQualifier(hsp.getQualifier());
return sp;
}
@Override
public int hashCode() {
return new HashCodeBuilder(11, 91)
.append(getQualifier().getClassid())
.append(getQualifier().getSchemeid())
.append(getValue())
.hashCode();
}
@Override
public boolean equals(Object obj) {
if (obj == null) {
return false;
}
if (obj == this) {
return true;
}
if (obj.getClass() != getClass()) {
return false;
}
final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
return new EqualsBuilder()
.append(getQualifier().getClassid(), rhs.getQualifier().getClassid())
.append(getQualifier().getSchemeid(), rhs.getQualifier().getSchemeid())
.append(getValue(), rhs.getValue())
.isEquals();
}
}

View File

@ -0,0 +1,22 @@
package eu.dnetlib.dhp.schema.oaf;
public class ModelHardLimits {
public static final String LAYOUT = "index";
public static final String INTERPRETATION = "openaire";
public static final String SEPARATOR = "-";
public static final int MAX_EXTERNAL_ENTITIES = 50;
public static final int MAX_AUTHORS = 200;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
public static final int MAX_TITLE_LENGTH = 5000;
public static final int MAX_TITLES = 10;
public static final int MAX_ABSTRACT_LENGTH = 150000;
public static final int MAX_INSTANCES = 10;
public static String getCollectionName(String format) {
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
}
}

View File

@ -0,0 +1,364 @@
package eu.dnetlib.dhp.schema.oaf;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils {
public static Oaf merge(final Oaf left, final Oaf right) {
if (ModelSupport.isSubClass(left, OafEntity.class)) {
return mergeEntities((OafEntity) left, (OafEntity) right);
} else if (ModelSupport.isSubClass(left, Relation.class)) {
((Relation) left).mergeFrom((Relation) right);
} else {
throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
}
return left;
}
public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
if (ModelSupport.isSubClass(left, Result.class)) {
return mergeResults((Result) left, (Result) right);
} else if (ModelSupport.isSubClass(left, Datasource.class)) {
((Datasource) left).mergeFrom((Datasource) right);
} else if (ModelSupport.isSubClass(left, Organization.class)) {
((Organization) left).mergeFrom((Organization) right);
} else if (ModelSupport.isSubClass(left, Project.class)) {
((Project) left).mergeFrom((Project) right);
} else {
throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
}
return left;
}
public static Result mergeResults(Result left, Result right) {
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
} else {
right.mergeFrom(left);
return right;
}
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) {
throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)");
}
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> Field<T> field(final T value, final DataInfo info) {
if (value == null || StringUtils.isBlank(value.toString())) {
return null;
}
final Field<T> field = new Field<>();
field.setValue(value);
field.setDataInfo(info);
return field;
}
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
return Arrays
.stream(values)
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values
.stream()
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList());
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
return accessRight(classid, classname, schemeid, schemename, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setSchemename(schemename);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static Qualifier qualifier(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
q.setSchemename(schemename);
return q;
}
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
q.setSchemename(qualifier.getSchemename());
return q;
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static ExtraInfo extraInfo(
final String name,
final String value,
final String typology,
final String provenance,
final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(
final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final DataInfo dataInfo) {
return journal(
name,
issnPrinted,
issnOnline,
issnLinking,
null,
null,
null,
null,
null,
null,
null,
dataInfo);
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate,
final DataInfo dataInfo) {
if (StringUtils.isNotBlank(name)
|| StringUtils.isNotBlank(issnPrinted)
|| StringUtils.isNotBlank(issnOnline)
|| StringUtils.isNotBlank(issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
j.setDataInfo(dataInfo);
return j;
} else {
return null;
}
}
public static DataInfo dataInfo(
final Boolean deletedbyinference,
final String inferenceprovenance,
final Boolean inferred,
final Boolean invisible,
final Qualifier provenanceaction,
final String trust) {
final DataInfo d = new DataInfo();
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setInvisible(invisible);
d.setProvenanceaction(provenanceaction);
d.setTrust(trust);
return d;
}
public static String createOpenaireId(
final int prefix,
final String originalId,
final boolean to_md5) {
if (StringUtils.isBlank(originalId)) {
return null;
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
}
public static String createOpenaireId(
final String type,
final String originalId,
final boolean to_md5) {
switch (type) {
case "datasource":
return createOpenaireId(10, originalId, to_md5);
case "organization":
return createOpenaireId(20, originalId, to_md5);
case "person":
return createOpenaireId(30, originalId, to_md5);
case "project":
return createOpenaireId(40, originalId, to_md5);
default:
return createOpenaireId(50, originalId, to_md5);
}
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
public static <T> Predicate<T> distinctByKey(
final Function<? super T, ?> keyExtractor) {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new AccessRightComparator<>());
final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
}

View File

@ -0,0 +1,78 @@
package eu.dnetlib.dhp.schema.oaf;
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class ResultTypeComparator implements Comparator<Result> {
@Override
public int compare(Result left, Result right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
HashSet<String> lCf = getCollectedFromIds(left);
HashSet<String> rCf = getCollectedFromIds(right);
if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
return -1;
}
if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
return 1;
}
String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
protected HashSet<String> getCollectedFromIds(Result left) {
return Optional
.ofNullable(left.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(c -> c.getKey())
.collect(Collectors.toCollection(HashSet::new)))
.orElse(new HashSet<>());
}
}

View File

@ -1,46 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.HashSet;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleaningFunctions {
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)";
public static final String DOI_PREFIX = "10.";
public static final Set<String> PID_BLACKLIST = new HashSet<>();
static {
PID_BLACKLIST.add("none");
PID_BLACKLIST.add("na");
}
public CleaningFunctions() {
}
/**
* Utility method that filter PID values on a per-type basis.
* @param s the PID whose value will be checked.
* @return false if the pid matches the filter criteria, true otherwise.
*/
public static boolean pidFilter(StructuredProperty s) {
final String pidValue = s.getValue();
if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(pidValue) ||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
return false;
}
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
return false;
}
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
}
}

View File

@ -1,30 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import org.apache.commons.lang3.StringUtils;
public class DoiCleaningRule {
public static String clean(final String doi) {
if (doi == null)
return null;
final String replaced = doi
.replaceAll("\\n|\\r|\\t|\\s", "")
.replaceAll("^doi:", "")
.toLowerCase()
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
if (StringUtils.isEmpty(replaced))
return null;
if (!replaced.contains("10."))
return null;
final String ret = replaced.substring(replaced.indexOf("10."));
if (!ret.startsWith(CleaningFunctions.DOI_PREFIX))
return null;
return ret;
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class FundRefCleaningRule {
public static final Pattern PATTERN = Pattern.compile("\\d+");
public static String clean(final String fundRefId) {
String s = fundRefId
.toLowerCase()
.replaceAll("\\s", "");
Matcher m = PATTERN.matcher(s);
if (m.find()) {
return m.group();
} else {
return "";
}
}
}

View File

@ -1,24 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class GridCleaningRule {
public static final Pattern PATTERN = Pattern.compile("(?<grid>\\d{4,6}\\.[0-9a-z]{1,2})");
public static String clean(String grid) {
String s = grid
.replaceAll("\\s", "")
.toLowerCase();
Matcher m = PATTERN.matcher(s);
if (m.find()) {
return "grid." + m.group("grid");
}
return "";
}
}

View File

@ -1,21 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// https://www.wikidata.org/wiki/Property:P213
public class ISNICleaningRule {
public static final Pattern PATTERN = Pattern.compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])");
public static String clean(final String isni) {
Matcher m = PATTERN.matcher(isni);
if (m.find()) {
return String.join("", m.group(1), m.group(2), m.group(3), m.group(4));
} else {
return "";
}
}
}

View File

@ -5,14 +5,11 @@ import static com.google.common.base.Preconditions.checkArgument;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.HashBiMap;
@ -20,6 +17,7 @@ import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
* Factory class for OpenAIRE identifiers in the Graph
@ -29,11 +27,15 @@ public class IdentifierFactory implements Serializable {
public static final String ID_SEPARATOR = "::";
public static final String ID_PREFIX_SEPARATOR = "|";
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
"(^10\\.1002\\/[^\\s]+$)|" +
"(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)|" +
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
public static final int ID_PREFIX_LEN = 12;
/**
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE.
* The id of the record (source_::id) will be rewritten as pidType_::id)
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE
*/
public static final Map<PidType, HashBiMap<String, String>> PID_AUTHORITY = Maps.newHashMap();
@ -41,8 +43,6 @@ public class IdentifierFactory implements Serializable {
PID_AUTHORITY.put(PidType.doi, HashBiMap.create());
PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref");
PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite");
PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "ZENODO");
PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "Zenodo");
PID_AUTHORITY.put(PidType.pmc, HashBiMap.create());
PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
@ -54,50 +54,10 @@ public class IdentifierFactory implements Serializable {
PID_AUTHORITY.put(PidType.arXiv, HashBiMap.create());
PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive");
PID_AUTHORITY.put(PidType.w3id, HashBiMap.create());
PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ROHub");
}
/**
* Declares the associations PID_TYPE -> [DATASOURCE ID, PID SUBSTRING] considered as delegated authority for that
* PID_TYPE. Example, Zenodo is delegated to forge DOIs that contain the 'zenodo' word.
*
* If a record with the same id (same pid) comes from 2 data sources, the one coming from a delegated source wins. E.g. Zenodo records win over those from Datacite.
* See also https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/187 and the class dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
*/
public static final Map<PidType, Map<String, String>> DELEGATED_PID_AUTHORITY = Maps.newHashMap();
static {
DELEGATED_PID_AUTHORITY.put(PidType.doi, new HashMap<>());
DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "zenodo");
DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "zenodo");
DELEGATED_PID_AUTHORITY.put(PidType.w3id, new HashMap<>());
DELEGATED_PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ro-id");
}
/**
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] whose records are considered enrichment for the graph.
* Their OpenAIRE ID is built from the declared PID type. Are merged with their corresponding record, identified by
* the same OpenAIRE id.
*/
public static final Map<PidType, HashBiMap<String, String>> ENRICHMENT_PROVIDER = Maps.newHashMap();
static {
ENRICHMENT_PROVIDER.put(PidType.doi, HashBiMap.create());
ENRICHMENT_PROVIDER.get(PidType.doi).put(OPEN_APC_ID, OPEN_APC_NAME);
}
public static Set<String> delegatedAuthorityDatasourceIds() {
return DELEGATED_PID_AUTHORITY
.values()
.stream()
.flatMap(m -> m.keySet().stream())
.collect(Collectors.toCollection(HashSet::new));
}
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
return pidFromInstance(pid, collectedFrom, true).distinct().collect(Collectors.toList());
return pidFromInstance(pid, collectedFrom).distinct().collect(Collectors.toList());
}
public static <T extends Result> String createDOIBoostIdentifier(T entity) {
@ -144,12 +104,12 @@ public class IdentifierFactory implements Serializable {
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
final Map<String, Set<StructuredProperty>> pids = extractPids(entity);
final Map<String, List<StructuredProperty>> pids = extractPids(entity);
return pids
.values()
.stream()
.flatMap(Set::stream)
.flatMap(s -> s.stream())
.min(new PidComparator<>(entity))
.map(
min -> Optional
@ -165,40 +125,40 @@ public class IdentifierFactory implements Serializable {
.orElseGet(entity::getId);
}
private static <T extends OafEntity> Map<String, Set<StructuredProperty>> extractPids(T entity) {
private static <T extends OafEntity> Map<String, List<StructuredProperty>> extractPids(T entity) {
if (entity instanceof Result) {
return Optional
.ofNullable(((Result) entity).getInstance())
.map(IdentifierFactory::mapPids)
.map(
instance -> mapPids(instance))
.orElse(new HashMap<>());
} else {
return entity
.getPid()
.stream()
.map(PidCleaner::normalizePidValue)
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(
Collectors
.groupingBy(
p -> p.getQualifier().getClassid(),
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
Collectors.mapping(p -> p, Collectors.toList())));
}
}
private static Map<String, Set<StructuredProperty>> mapPids(List<Instance> instance) {
private static Map<String, List<StructuredProperty>> mapPids(List<Instance> instance) {
return instance
.stream()
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false))
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom()))
.flatMap(Function.identity())
.collect(
Collectors
.groupingBy(
p -> p.getQualifier().getClassid(),
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
Collectors.mapping(p -> p, Collectors.toList())));
}
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom,
boolean mapHandles) {
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom) {
return Optional
.ofNullable(pid)
.map(
@ -206,48 +166,24 @@ public class IdentifierFactory implements Serializable {
.stream()
// filter away PIDs provided by a DS that is not considered an authority for the
// given PID Type
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
.map(PidCleaner::normalizePidValue)
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
.filter(p -> {
return shouldFilterPid(collectedFrom, p);
})
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter))
.orElse(Stream.empty());
}
private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) {
private static boolean shouldFilterPid(KeyValue collectedFrom, StructuredProperty p) {
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
if (Objects.isNull(collectedFrom)) {
return false;
}
boolean isEnrich = Optional
.ofNullable(ENRICHMENT_PROVIDER.get(pType))
.map(
enrich -> enrich.containsKey(collectedFrom.getKey())
|| enrich.containsValue(collectedFrom.getValue()))
.orElse(false);
boolean isAuthority = Optional
return pType.equals(PidType.handle) || Optional.ofNullable(collectedFrom).isPresent() &&
Optional
.ofNullable(PID_AUTHORITY.get(pType))
.map(
authorities -> authorities.containsKey(collectedFrom.getKey())
|| authorities.containsValue(collectedFrom.getValue()))
.map(authorities -> {
return authorities.containsKey(collectedFrom.getKey())
|| authorities.containsValue(collectedFrom.getValue());
})
.orElse(false);
return (mapHandles && pType.equals(PidType.handle)) || isEnrich || isAuthority;
}
private static boolean isNotFromDelegatedAuthority(KeyValue collectedFrom, StructuredProperty p) {
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
final Map<String, String> da = DELEGATED_PID_AUTHORITY.get(pType);
if (Objects.isNull(da)) {
return true;
}
if (!da.containsKey(collectedFrom.getKey())) {
return true;
}
return StringUtils.contains(p.getValue(), da.get(collectedFrom.getKey()));
}
/**
@ -259,16 +195,12 @@ public class IdentifierFactory implements Serializable {
}
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
return idFromPid(ModelSupport.getIdPrefix(entity.getClass()), s.getQualifier().getClassid(), s.getValue(), md5);
}
public static String idFromPid(String numericPrefix, String pidType, String pidValue, boolean md5) {
return new StringBuilder()
.append(numericPrefix)
.append(ModelSupport.getIdPrefix(entity.getClass()))
.append(ID_PREFIX_SEPARATOR)
.append(createPrefix(pidType))
.append(createPrefix(s.getQualifier().getClassid()))
.append(ID_SEPARATOR)
.append(md5 ? md5(pidValue) : pidValue)
.append(md5 ? DHPUtils.md5(s.getValue()) : s.getValue())
.toString();
}
@ -281,14 +213,4 @@ public class IdentifierFactory implements Serializable {
return prefix.substring(0, ID_PREFIX_LEN);
}
public static String md5(final String s) {
try {
final MessageDigest md = MessageDigest.getInstance("MD5");
md.update(s.getBytes(StandardCharsets.UTF_8));
return new String(Hex.encodeHex(md.digest()));
} catch (final Exception e) {
return null;
}
}
}

View File

@ -1,78 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Optional;
import java.util.stream.Collectors;
//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by FernFlower decompiler)
//
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
public class MergeComparator implements Comparator<Oaf> {
public MergeComparator() {
}
public int compare(Oaf left, Oaf right) {
// nulls at the end
if (left == null && right == null) {
return 0;
} else if (left == null) {
return -1;
} else if (right == null) {
return 1;
}
// invisible
if (left.getDataInfo() != null && left.getDataInfo().getInvisible() == true) {
if (right.getDataInfo() != null && right.getDataInfo().getInvisible() == false) {
return -1;
}
}
// collectedfrom
HashSet<String> lCf = getCollectedFromIds(left);
HashSet<String> rCf = getCollectedFromIds(right);
if (lCf.contains("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
&& !rCf.contains("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")) {
return -1;
} else if (!lCf.contains("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
&& rCf.contains("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")) {
return 1;
}
SubEntityType lClass = SubEntityType.fromClass(left.getClass());
SubEntityType rClass = SubEntityType.fromClass(right.getClass());
return lClass.ordinal() - rClass.ordinal();
}
protected HashSet<String> getCollectedFromIds(Oaf left) {
return (HashSet) Optional.ofNullable(left.getCollectedfrom()).map((cf) -> {
return (HashSet) cf.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
}).orElse(new HashSet());
}
enum SubEntityType {
publication, dataset, software, otherresearchproduct, datasource, organization, project;
/**
* Resolves the EntityType, given the relative class name
*
* @param clazz the given class name
* @param <T> actual OafEntity subclass
* @return the EntityType associated to the given class
*/
public static <T extends Oaf> SubEntityType fromClass(Class<T> clazz) {
return valueOf(clazz.getSimpleName().toLowerCase());
}
}
}

View File

@ -1,106 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.*;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
public class MergeEntitiesComparator implements Comparator<Oaf> {
static final List<String> PID_AUTHORITIES = Arrays
.asList(
ModelConstants.ARXIV_ID,
ModelConstants.PUBMED_CENTRAL_ID,
ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
ModelConstants.DATACITE_ID,
ModelConstants.CROSSREF_ID);
static final List<String> RESULT_TYPES = Arrays
.asList(
ModelConstants.ORP_RESULTTYPE_CLASSID,
ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
ModelConstants.DATASET_RESULTTYPE_CLASSID,
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
@Override
public int compare(Oaf left, Oaf right) {
if (left == null && right == null)
return 0;
if (left == null)
return -1;
if (right == null)
return 1;
int res = 0;
// pid authority
int cfp1 = Optional
.ofNullable(left.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
.max(Integer::compare)
.orElse(-1))
.orElse(-1);
int cfp2 = Optional
.ofNullable(right.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
.max(Integer::compare)
.orElse(-1))
.orElse(-1);
if (cfp1 >= 0 && cfp1 > cfp2) {
return 1;
} else if (cfp2 >= 0 && cfp2 > cfp1) {
return -1;
}
// trust
if (left.getDataInfo() != null && right.getDataInfo() != null) {
res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
}
// result type
if (res == 0) {
if (left instanceof Result && right instanceof Result) {
Result r1 = (Result) left;
Result r2 = (Result) right;
if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) {
if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) {
return -1;
}
} else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) {
return 1;
}
int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid());
int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid());
if (rt1 >= 0 && rt1 > rt2) {
return 1;
} else if (rt2 >= 0 && rt2 > rt1) {
return -1;
}
}
}
// id
if (res == 0) {
if (left instanceof OafEntity && right instanceof OafEntity) {
res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId());
}
}
return res;
}
}

View File

@ -1,27 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
public class ModelHardLimits {
private ModelHardLimits() {
}
public static final String LAYOUT = "index";
public static final String INTERPRETATION = "openaire";
public static final String SEPARATOR = "-";
public static final int MAX_EXTERNAL_ENTITIES = 50;
public static final int MAX_AUTHORS = 200;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
public static final int MAX_TITLE_LENGTH = 5000;
public static final int MAX_TITLES = 10;
public static final int MAX_ABSTRACTS = 10;
public static final int MAX_ABSTRACT_LENGTH = 150000;
public static final int MAX_RELATED_ABSTRACT_LENGTH = 500;
public static final int MAX_INSTANCES = 10;
public static String getCollectionName(String format) {
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
}
}

View File

@ -1,478 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.sql.Array;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtils {
private OafMapperUtils() {
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) {
throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)");
}
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> Field<T> field(final T value, final DataInfo info) {
if (value == null || StringUtils.isBlank(value.toString())) {
return null;
}
final Field<T> field = new Field<>();
field.setValue(value);
field.setDataInfo(info);
return field;
}
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
return Arrays
.stream(values)
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(Field::getValue))
.collect(Collectors.toList());
}
public static <T> List<T> listValues(Array values) throws SQLException {
if (Objects.isNull(values)) {
return null;
}
return Arrays
.stream((T[]) values.getArray())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values
.stream()
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(Field::getValue))
.collect(Collectors.toList());
}
public static InstanceTypeMapping instanceTypeMapping(String originalType, String code, String label,
String vocabularyName) {
final InstanceTypeMapping m = new InstanceTypeMapping();
m.setVocabularyName(vocabularyName);
m.setOriginalType(originalType);
m.setTypeCode(code);
m.setTypeLabel(label);
return m;
}
public static InstanceTypeMapping instanceTypeMapping(String originalType, Qualifier term) {
return instanceTypeMapping(originalType, term.getClassid(), term.getClassname(), term.getSchemeid());
}
public static InstanceTypeMapping instanceTypeMapping(String originalType) {
return instanceTypeMapping(originalType, null, null, null);
}
public static InstanceTypeMapping instanceTypeMapping(String originalType, String vocabularyName) {
return instanceTypeMapping(originalType, null, null, vocabularyName);
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
return accessRight(classid, classname, schemeid, schemename, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setSchemename(schemename);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static Qualifier qualifier(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
q.setSchemename(schemename);
return q;
}
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
q.setSchemename(qualifier.getSchemename());
return q;
}
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static ExtraInfo extraInfo(
final String name,
final String value,
final String typology,
final String provenance,
final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(
final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final DataInfo dataInfo) {
return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal(
name,
issnPrinted,
issnOnline,
issnLinking,
null,
null,
null,
null,
null,
null,
null,
dataInfo) : null;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate,
final DataInfo dataInfo) {
if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
j.setDataInfo(dataInfo);
return j;
} else {
return null;
}
}
private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
return StringUtils.isNotBlank(issnPrinted)
|| StringUtils.isNotBlank(issnOnline)
|| StringUtils.isNotBlank(issnLinking);
}
public static DataInfo dataInfo(
final Boolean deletedbyinference,
final String inferenceprovenance,
final Boolean inferred,
final Boolean invisible,
final Qualifier provenanceaction,
final String trust) {
final DataInfo d = new DataInfo();
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setInvisible(invisible);
d.setProvenanceaction(provenanceaction);
d.setTrust(trust);
return d;
}
public static String createOpenaireId(
final int prefix,
final String originalId,
final boolean to_md5) {
if (StringUtils.isBlank(originalId)) {
return null;
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
}
public static String createOpenaireId(
final String type,
final String originalId,
final boolean to_md5) {
switch (type) {
case "datasource":
return createOpenaireId(10, originalId, to_md5);
case "organization":
return createOpenaireId(20, originalId, to_md5);
case "person":
return createOpenaireId(30, originalId, to_md5);
case "project":
return createOpenaireId(40, originalId, to_md5);
default:
return createOpenaireId(50, originalId, to_md5);
}
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
public static <T> Predicate<T> distinctByKey(
final Function<? super T, ?> keyExtractor) {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(Instance::getAccessright)
.min(new AccessRightComparator<>());
final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new);
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) {
KeyValue kv = new KeyValue();
kv.setDataInfo(dataInfo);
kv.setKey(key);
kv.setValue(value);
return kv;
}
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
Measure m = new Measure();
m.setId(id);
m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo)));
return m;
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity) {
return getRelation(source, target, relType, subRelType, relClass, entity, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity,
final String validationDate) {
return getRelation(
source, target, relType, subRelType, relClass, entity.getCollectedfrom(), entity.getDataInfo(),
entity.getLastupdatetimestamp(), validationDate, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<KeyValue> collectedfrom,
final DataInfo dataInfo,
final Long lastupdatetimestamp) {
return getRelation(
source, target, relType, subRelType, relClass, collectedfrom, dataInfo, lastupdatetimestamp, null, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<KeyValue> collectedfrom,
final DataInfo dataInfo,
final Long lastupdatetimestamp,
final String validationDate,
final List<KeyValue> properties) {
final Relation rel = new Relation();
rel.setRelType(relType);
rel.setSubRelType(subRelType);
rel.setRelClass(relClass);
rel.setSource(source);
rel.setTarget(target);
rel.setCollectedfrom(collectedfrom);
rel.setDataInfo(dataInfo);
rel.setLastupdatetimestamp(lastupdatetimestamp);
rel.setValidated(StringUtils.isNotBlank(validationDate));
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
rel.setProperties(properties);
return rel;
}
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
}

View File

@ -9,18 +9,10 @@ public class OrganizationPidComparator implements Comparator<StructuredProperty>
@Override
public int compare(StructuredProperty left, StructuredProperty right) {
if (left == null) {
return right == null ? 0 : -1;
} else if (right == null) {
return 1;
}
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
if (lClass.equals(rClass))
return 0;
if (lClass.equals(PidType.openorgs))
return -1;
if (rClass.equals(PidType.openorgs))

View File

@ -1,21 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class PICCleaningRule {
public static final Pattern PATTERN = Pattern.compile("\\d{9}");
public static String clean(final String pic) {
Matcher m = PATTERN.matcher(pic);
if (m.find()) {
return m.group();
} else {
return "";
}
}
}

View File

@ -34,7 +34,4 @@ public class PidBlacklistProvider {
.orElse(new HashSet<>());
}
private PidBlacklistProvider() {
}
}

View File

@ -1,62 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidCleaner {
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
pid
.setValue(
normalizePidValue(
pid.getQualifier().getClassid(),
pid.getValue()));
return pid;
}
public static String normalizePidValue(String pidType, String pidValue) {
String value = Optional
.ofNullable(pidValue)
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pidType) {
// TODO add cleaning for more PID types as needed
// Result
case "doi":
return DoiCleaningRule.clean(value);
case "pmid":
return PmidCleaningRule.clean(value);
case "pmc":
return PmcCleaningRule.clean(value);
case "handle":
case "arXiv":
return value;
// Organization
case "GRID":
return GridCleaningRule.clean(value);
case "ISNI":
return ISNICleaningRule.clean(value);
case "ROR":
return RorCleaningRule.clean(value);
case "PIC":
return PICCleaningRule.clean(value);
case "FundRef":
return FundRefCleaningRule.clean(value);
default:
return value;
}
}
}

View File

@ -11,7 +11,7 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
private final T entity;
private T entity;
public PidComparator(T entity) {
this.entity = entity;

View File

@ -5,61 +5,11 @@ import org.apache.commons.lang3.EnumUtils;
public enum PidType {
/**
* The DOI syntax shall be made up of a DOI prefix and a DOI suffix separated by a forward slash.
*
* There is no defined limit on the length of the DOI name, or of the DOI prefix or DOI suffix.
*
* The DOI name is case-insensitive and can incorporate any printable characters from the legal graphic characters
* of Unicode. Further constraints on character use (e.g. use of language-specific alphanumeric characters) can be
* defined for an application by the ISO 26324 Registration Authority.
*
*
* DOI prefix: The DOI prefix shall be composed of a directory indicator followed by a registrant code.
* These two components shall be separated by a full stop (period). The directory indicator shall be "10" and
* distinguishes the entire set of character strings (prefix and suffix) as digital object identifiers within the
* resolution system.
*
* Registrant code: The second element of the DOI prefix shall be the registrant code. The registrant code is a
* unique string assigned to a registrant.
*
* DOI suffix: The DOI suffix shall consist of a character string of any length chosen by the registrant.
* Each suffix shall be unique to the prefix element that precedes it. The unique suffix can be a sequential number,
* or it might incorporate an identifier generated from or based on another system used by the registrant
* (e.g. ISAN, ISBN, ISRC, ISSN, ISTC, ISNI; in such cases, a preferred construction for such a suffix can be
* specified, as in Example 1).
*
* Source: https://www.doi.org/doi_handbook/2_Numbering.html#2.2
*/
doi,
/**
* PubMed Unique Identifier (PMID)
*
* This field is a 1-to-8 digit accession number with no leading zeros. It is present on all records and is the
* accession number for managing and disseminating records. PMIDs are not reused after records are deleted.
*
* Beginning in February 2012 PMIDs include extensions following a decimal point to account for article versions
* (e.g., 21804956.2). All citations are considered version 1 until replaced. The extended PMID is not displayed
* on the MEDLINE format.
*
* View the citation in abstract format in PubMed to access additional versions when available (see the article in
* the Jan-Feb 2012 NLM Technical Bulletin).
*
* Source: https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmid
*/
pmid,
/**
* This field contains the unique identifier for the cited article in PubMed Central. The identifier begins with the
* prefix PMC.
*
* Source: https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmc
*/
pmc, handle, arXiv, nct, pdb, w3id,
// Result
doi, pmid, pmc, handle, arXiv, nct, pdb,
// Organization
openorgs, ROR, GRID, PIC, ISNI, Wikidata, FundRef, corda, corda_h2020, mag_id, urn,
openorgs, corda, corda_h2020, GRID, mag_id, urn,
// Used by dedup
undefined, original;

View File

@ -4,7 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
public class PidValueComparator implements Comparator<StructuredProperty> {
@ -18,8 +18,8 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
if (right == null)
return -1;
StructuredProperty l = PidCleaner.normalizePidValue(left);
StructuredProperty r = PidCleaner.normalizePidValue(right);
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
return Optional
.ofNullable(l.getValue())

View File

@ -1,24 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class PmcCleaningRule {
public static final Pattern PATTERN = Pattern.compile("PMC\\d{1,8}");
public static String clean(String pmc) {
String s = pmc
.replaceAll("\\s", "")
.toUpperCase();
final Matcher m = PATTERN.matcher(s);
if (m.find()) {
return m.group();
}
return "";
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// https://researchguides.stevens.edu/c.php?g=442331&p=6577176
public class PmidCleaningRule {
public static final Pattern PATTERN = Pattern.compile("0*(\\d{1,8})");
public static String clean(String pmid) {
String s = pmid
.toLowerCase()
.replaceAll("\\s", "");
final Matcher m = PATTERN.matcher(s);
if (m.find()) {
return m.group(1);
}
return "";
}
}

View File

@ -1,46 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
/**
* Comparator for sorting the values from the dnet:review_levels vocabulary, implements the following ordering
*
* peerReviewed (0001) > nonPeerReviewed (0002) > UNKNOWN (0000)
*/
public class RefereedComparator implements Comparator<Qualifier> {
@Override
public int compare(Qualifier left, Qualifier right) {
if (left == null || left.getClassid() == null) {
return (right == null || right.getClassid() == null) ? 0 : -1;
} else if (right == null || right.getClassid() == null) {
return 1;
}
String lClass = left.getClassid();
String rClass = right.getClassid();
if (lClass.equals(rClass))
return 0;
if ("0001".equals(lClass))
return -1;
if ("0001".equals(rClass))
return 1;
if ("0002".equals(lClass))
return -1;
if ("0002".equals(rClass))
return 1;
if ("0000".equals(lClass))
return -1;
if ("0000".equals(rClass))
return 1;
return 0;
}
}

View File

@ -13,9 +13,6 @@ public class ResultPidComparator implements Comparator<StructuredProperty> {
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
if (lClass.equals(rClass))
return 0;
if (lClass.equals(PidType.doi))
return -1;
if (rClass.equals(PidType.doi))

View File

@ -1,27 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// https://ror.readme.io/docs/ror-identifier-pattern
public class RorCleaningRule {
public static final String ROR_PREFIX = "https://ror.org/";
private static final Pattern PATTERN = Pattern.compile("(?<ror>0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2})");
public static String clean(String ror) {
String s = ror
.replaceAll("\\s", "")
.toLowerCase();
Matcher m = PATTERN.matcher(s);
if (m.find()) {
return ROR_PREFIX + m.group("ror");
}
return "";
}
}

Some files were not shown because too many files have changed in this diff Show More